//====- ExtendDAPPass.cpp - Extend DAP Dialect Lowering Pass  -------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//
//
// This file defines Extend DAP dialect lowering pass.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Pass/Pass.h"

#include "DAP/DAPDialect.h"
#include "DAP/DAPOps.h"
#include <cstdint>
#include <iostream>

using namespace mlir;
using namespace buddy;
using namespace vector;
using namespace mlir::arith;
using namespace mlir::linalg;
using namespace mlir::bufferization;
using namespace mlir::scf;

//===----------------------------------------------------------------------===//
// Rewrite Pattern
//===----------------------------------------------------------------------===//
/// Emits the IR that materializes the 201x80 f64 mel filter as a tensor value.
///
/// The filter is described sparsely by 391 (row, column, value) triples held
/// in three parallel constant tables. The generated IR:
///   1. creates a 201x80 tensor splatted with `f0` and views it as a memref,
///   2. runs an `scf.for` from `c0` up to 391 with step `c1` that reads the
///      i-th triple and stores `value` at `[row, column]`,
///   3. converts the memref back into a tensor, which is returned.
///
/// On return the rewriter's insertion point is right after the generated loop
/// (following the final `to_tensor` op).
///
/// \param rewriter Rewriter used to create all operations.
/// \param loc      Location attached to every created operation.
/// \param c0       Lower bound of the fill loop (presumably the index
///                 constant 0 -- confirm against the caller).
/// \param c1       Step of the fill loop (presumably the index constant 1 --
///                 confirm against the caller).
/// \param f0       Scalar splatted into every element of the 201x80 f64
///                 tensor before the stored entries are written.
/// \returns The tensor produced by `bufferization.to_tensor` on the filled
///          buffer.
Value initMelFilter(PatternRewriter &rewriter, Location loc, Value c0, Value c1,
                    Value f0) {
  // Number of explicitly stored filter entries; the three tables below must
  // all have exactly this many elements.
  constexpr int64_t kNumEntries = 391;

  FloatType f64Ty = rewriter.getF64Type();
  // Values of the stored entries. Kept as a static constant table so it is
  // not rebuilt on the heap every time the pattern fires.
  static const double kMelFilterValues[] = {
      0.024862593984176087,   0.0019908218880980706,  0.022871772096078023,
      0.003981643776196141,   0.020880950207979945,   0.005972465664294215,
      0.018890128319881874,   0.007963287552392284,   0.016899306431783803,
      0.00995410944049036,    0.014908484543685726,   0.011944931328588433,
      0.012917662655587655,   0.013935753216686492,   0.0109268407674896,
      0.015926575104784558,   0.008936018879391525,   0.017917396992882653,
      0.006945196991293433,   0.019908218880980738,   0.004954375103195362,
      0.021899040769078785,   0.0029635532150973053,  0.02388986265717686,
      0.000972731326999214,   0.025880684545274913,   0.025835325311175383,
      0.0010180905610987637,  0.023844503423077285,   0.003008912449196894,
      0.0218536815349792,     0.004999734337294947,   0.019862859646881146,
      0.00699055622539301,    0.017872037758783092,   0.008981378113491093,
      0.015881215870684983,   0.010972200001589204,   0.013890393982586886,
      0.012963021889687279,   0.011899572094488815,   0.01495384377778533,
      0.009908750206390747,   0.016944665665883398,   0.007917928318292721,
      0.01893548755398142,    0.00592710643019463,    0.020874010059259842,
      0.0040404255283634505,  0.02211421726709443,    0.003318606124028175,
      0.021736724240202378,   0.0036109675065762467,  0.020497701500567712,
      0.0047621938624405336,  0.018486659689787407,   0.00659261778657699,
      0.015856038061722075,   0.00896277117173217,    0.01273876809538182,
      0.011751329556399702,   0.009250369503549623,   0.014853145171184273,
      0.005490841259941731,   0.018177473406255133,   0.0015463665611462304,
      0.0028155461301291296,  0.016329520204672005,   0.0074201889869586046,
      0.011181051149095055,   0.012018863908450889,   0.006065350444974551,
      0.016561277418378373,   0.0010297985194884273,  0.004360878822945124,
      0.012770536755428743,   0.009707189146398206,   0.00698640273562069,
      0.014854299940667337,   0.0014180475372245899,  0.004391219533926463,
      0.011486922519974862,   0.010089744452471235,   0.005411105098683222,
      0.00040022287019040627, 0.01473556574143922,    0.006518189694660876,
      0.008278412940789993,   0.012277561276489102,   0.0021878083895293813,
      0.003967812818254116,   0.01018448003033658,    0.009981875463793392,
      0.0038694348523115605,  0.002286485205918727,   0.011274894758755125,
      0.00846622203459276,    0.00482029386344711,    0.0013397691078130697,
      0.01167825163503901,    0.0076086824024487005,  0.005156961757386885,
      0.0010091040034666294,  0.011507895445653202,   0.007301822420859842,
      0.00498210435635522,    0.00119016554305424,    0.010863499380371759,
      0.007451189902152603,   0.004385921781217367,   0.001791381421388157,
      0.009832492179588035,   0.007973956151288167,   0.003447454713364045,
      0.00273258990979288,    0.008491348038548032,   0.008797688393881514,
      0.00223576473247454,    0.003943828025486854,   0.00690675179681992,
      0.00985924113208423,    0.0008110002442122107,  0.005364237465034378,
      0.005136650837642653,   0.0008692337979845255,  0.009462301431073093,
      0.0069410774768914035,  0.0032312041128928558,  0.0027783994364693502,
      0.0072370497293947405,  0.008628834806348044,   0.0012336377872858904,
      0.004773912819246566,   0.004943322320664174,   0.0009189908321450893,
      0.008653006854042458,   0.006818502698028209,   0.0026164354511310182,
      0.0032485836741847724,  0.006051854749492107,   0.008880466967901061,
      0.0002863667968266877,  0.005574480003847867,   0.0034677978994882394,
      0.0022684930397946727,  0.006649229002149792,   0.007871464954585431,
      0.0009245911230515482,  0.004809896965650232,   0.003870811942679267,
      0.0017483289767150309,  0.006817032762306986,   0.007283343633866355,
      0.00117032890964782,    0.00444812418116144,    0.0038987290660258203,
      0.001612904728456526,   0.006627129222403821,   0.007047320122030166,
      0.001089299240400779,   0.004421714754438077,   0.003615982700014719,
      0.0017961093868459883,  0.006142666159628658,   0.007102936106246769,
      0.000739228059530935,   0.00467144758787073,    0.003079108186777991,
      0.002239959069494691,   0.005418988314025048,   0.007397185776342812,
      0.0001706867135296284,  0.005145462616431531,   0.0023375742945811327,
      0.0028937394565202498,  0.004504461875632637,   0.0006420162966089689,
      0.006671349456684142,   0.005798479593256033,   0.0014345343541053727,
      0.003713231338714718,   0.0034412191129693185,  0.0016279830841734021,
      0.005447903871833263,   0.006591092774765041,   0.0004075043790786325,
      0.004660011565279688,   0.0022658304669981606,  0.002728930355794336,
      0.0041241565549176885,  0.0007978491463089834,  0.0059824826428372165,
      0.005700822451952891,   0.001009910787399597,   0.003912510374718906,
      0.0027308466911522586,  0.0021241982974849216,  0.00445178259490492,
      0.00033588622025093644, 0.006172718498657583,   0.005150905140435021,
      0.001294369078080886,   0.003494806955712255,   0.002888072359801849,
      0.0018387087709894891,  0.004481775641522813,   0.0001826105862667237,
      0.006075478923243776,   0.004886660120436632,   0.0013131388257841057,
      0.0033530009608540383,  0.0027890160764299436,  0.001819341801271444,
      0.004264893327075782,   0.0002856826416888495,  0.00574077057772162,
      0.00485997904767977,    0.0011121684505728251,  0.003439706723569149,
      0.002478930810892349,   0.002019434399458528,   0.003845693171211873,
      0.0005991620753479068,  0.0052124555315313965,  0.005028639927429189,
      0.0007317548848252071,  0.0037133714976539506,  0.001997469461541363,
      0.002398103067878713,   0.0032631840382575193,  0.0010828346381034754,
      0.004528898614973675,   0.005355687096081724,   0.00020713973879943928,
      0.004137659389003323,   0.001379277219478372,   0.002919631681924923,
      0.0025514147001573046,  0.0017016039748465222,  0.0037235521808362372,
      0.0004835762677681217,  0.00489568966151517,    0.004680894973891053,
      0.0006545265135735772,  0.003552918766430338,   0.001740005261220558,
      0.002424942558969623,   0.0028254840088675383,  0.001296966351508908,
      0.003910962756514519,   0.0001689901440481931,  0.0049964415041615,
      0.004270978712160627,   0.0008546266928695816,  0.003226396296503344,
      0.001859853580513147,   0.00218181388084606,    0.002865080468156712,
      0.0011372314651887762,  0.003870307355800277,   9.264904953149247e-05,
      0.004875534243443842,   0.00408313784805493,    0.0008483414885398006,
      0.0031157837355450563,  0.0017792497148243077,  0.0021484296230351824,
      0.002710157941108815,   0.0011810755105253086,  0.003641066167393322,
      0.00021372139801543514, 0.00457197439367783,    0.004079728686464056,
      0.0006716204322456959,  0.0031838932167458007,  0.0015337045412328826,
      0.0022880577470275453,  0.0023957886502200695,  0.0013922222773092897,
      0.0032578727592072563,  0.0004963868075910343,  0.004119956868194443,
      0.004227725348621998,   0.00035597961441710105, 0.003398120989238948,
      0.0011543279262957885,  0.0025685166298558978,  0.0019526762381744762,
      0.0017389122704728477,  0.0027510245500531635,  0.0009093079110897976,
      0.0035493728619318513,  7.970355170674751e-05,  0.004347721173810539,
      0.0037299628548962886,  0.0006682946412839993,  0.0029616929924361443,
      0.001407619285405272,   0.0021934231299760003,  0.002146943929526545,
      0.001425153267515856,   0.0028862685736478176,  0.000656883405055712,
      0.0036255932177690904,  0.004154576575021074,   9.92650919068254e-05,
      0.00344310661360058,    0.0007839298194100073,  0.0027316366521800855,
      0.0014685945469131891,  0.002020166690759591,   0.002153259274416371,
      0.0013086967293390965,  0.002837924001919553,   0.0005972267679186017,
      0.0035225887294227346,  0.00399151753267442,    0.00010181095051366054,
      0.0033326481291776314,  0.0007358568907029797,  0.002673778725680842,
      0.0013699028308922986,  0.002014909322184054,   0.0020039487710816176,
      0.0013560399186872648,  0.002637994711270937,   0.0006971705151904762,
      0.003272040651460256,   3.8301111693687365e-05, 0.0039060865916495753,
      0.0033682563798377797,  0.000553036427908215,   0.0027580986579326967,
      0.0011402059404032124,  0.0021479409360276127,  0.0017273754528982098,
      0.0015377832141225299,  0.002314544965393207,   0.0009276254922174464,
      0.002901714477888205,   0.00031746777031236304, 0.003488883990383202,
      0.0035233401613614045,  0.0002608386658672823,  0.0029582927579996227,
      0.0008045974293106004,  0.0023932453546378412,  0.0013483561927539183,
      0.0018281979512760594,  0.0018921149561972363,  0.0012631505479142777,
      0.0024358737196405545,  0.0006981031445524962,  0.0029796324830838727,
      0.00013305574119071463, 0.003523391246527191,   0.0032513804428682767,
      0.0003849812001174835,  0.0027281082517878774,  0.0008885386678154236,
      0.0022048360607074776,  0.0013920961355133636,  0.0016815638696270783,
      0.0018956536032113034,  0.001158291678546679,   0.002399211070909244,
      0.0006350194874662795,  0.0029027685386071836,  0.0001117472963858801,
      0.003406326006305124,   0.0031327631853624,     0.0003667416797849643,
      0.002648177672131103,   0.0008330700230168817,  0.0021635921588998063,
      0.001299398366248799,   0.00167900664566851,    0.0017657267094807168,
      0.0011944211324372133,  0.002232055052712634,   0.0007098356192059166,
      0.0026983833959445514,  0.00022525010597461998, 0.003164711739176469,
      0.0031413131931234614,  0.002692554165534394,   0.002243795137945327,
      0.0017950361103562596,  0.001346277082767192,   0.0008975180551781247,
      0.0004487590275890572};
  static_assert(static_cast<int64_t>(sizeof(kMelFilterValues) /
                                     sizeof(kMelFilterValues[0])) ==
                    kNumEntries,
                "mel filter value table must have kNumEntries elements");
  Value melFilterData = rewriter.create<arith::ConstantOp>(
      loc, DenseFPElementsAttr::get(RankedTensorType::get({kNumEntries}, f64Ty),
                                    ArrayRef<double>(kMelFilterValues)));

  IndexType idxTy = rewriter.getIndexType();
  // First-dimension (row) coordinate of every stored entry. The element type
  // is int64_t because dense `index` attributes use 64-bit storage regardless
  // of the host's size_t width.
  static const int64_t kDim1Indices[] = {
      1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,   6,   7,   7,   8,
      8,   9,   9,   10,  10,  11,  11,  12,  12,  13,  13,  14,  14,  15,  15,
      16,  16,  17,  17,  18,  18,  19,  19,  20,  20,  21,  21,  22,  22,  23,
      23,  24,  24,  25,  25,  26,  26,  27,  27,  28,  28,  29,  29,  30,  30,
      31,  31,  32,  32,  33,  33,  34,  34,  35,  35,  36,  36,  37,  37,  38,
      38,  39,  39,  40,  40,  41,  41,  42,  42,  43,  43,  44,  44,  45,  45,
      46,  46,  47,  47,  48,  48,  49,  49,  50,  50,  51,  51,  52,  52,  53,
      53,  54,  54,  55,  55,  56,  56,  57,  57,  58,  58,  59,  59,  60,  60,
      61,  61,  62,  62,  63,  63,  64,  64,  65,  65,  66,  66,  67,  67,  68,
      68,  69,  69,  70,  70,  71,  71,  72,  72,  73,  73,  74,  74,  75,  75,
      76,  76,  77,  77,  78,  78,  79,  79,  80,  80,  81,  81,  82,  82,  83,
      83,  84,  84,  85,  85,  86,  86,  87,  87,  88,  88,  89,  89,  90,  90,
      91,  91,  92,  92,  93,  93,  94,  94,  95,  95,  96,  96,  97,  97,  98,
      98,  99,  99,  100, 100, 101, 101, 102, 102, 103, 103, 104, 104, 105, 105,
      106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111, 112, 112, 113,
      113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120,
      121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127, 128,
      128, 129, 129, 130, 130, 131, 131, 132, 132, 133, 133, 134, 134, 135, 135,
      136, 136, 137, 137, 138, 138, 139, 139, 140, 140, 141, 141, 142, 142, 143,
      143, 144, 144, 145, 145, 146, 146, 147, 147, 148, 148, 149, 149, 150, 150,
      151, 151, 152, 152, 153, 153, 154, 154, 155, 155, 156, 156, 157, 157, 158,
      158, 159, 159, 160, 160, 161, 161, 162, 162, 163, 163, 164, 164, 165, 165,
      166, 166, 167, 167, 168, 168, 169, 169, 170, 170, 171, 171, 172, 172, 173,
      173, 174, 174, 175, 175, 176, 176, 177, 177, 178, 178, 179, 179, 180, 180,
      181, 181, 182, 182, 183, 183, 184, 184, 185, 185, 186, 186, 187, 187, 188,
      188, 189, 189, 190, 190, 191, 191, 192, 192, 193, 194, 195, 196, 197, 198,
      199};
  static_assert(static_cast<int64_t>(sizeof(kDim1Indices) /
                                     sizeof(kDim1Indices[0])) == kNumEntries,
                "row index table must have kNumEntries elements");
  Value dim1Index = rewriter.create<arith::ConstantOp>(
      loc, DenseElementsAttr::get(RankedTensorType::get({kNumEntries}, idxTy),
                                  ArrayRef<int64_t>(kDim1Indices)));

  // Second-dimension (column) coordinate of every stored entry.
  static const int64_t kDim2Indices[] = {
      0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,
      9,  10, 10, 11, 11, 12, 12, 13, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19,
      19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28,
      28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 33, 34, 34, 35, 35, 36,
      36, 37, 36, 37, 37, 38, 38, 39, 38, 39, 39, 40, 39, 40, 40, 41, 41, 42,
      41, 42, 42, 43, 42, 43, 43, 44, 43, 44, 44, 45, 44, 45, 45, 46, 45, 46,
      46, 47, 46, 47, 47, 48, 47, 48, 48, 49, 48, 49, 49, 50, 49, 50, 49, 50,
      50, 51, 50, 51, 51, 52, 51, 52, 51, 52, 52, 53, 52, 53, 53, 54, 53, 54,
      53, 54, 54, 55, 54, 55, 54, 55, 55, 56, 55, 56, 55, 56, 56, 57, 56, 57,
      56, 57, 57, 58, 57, 58, 57, 58, 58, 59, 58, 59, 58, 59, 58, 59, 59, 60,
      59, 60, 59, 60, 60, 61, 60, 61, 60, 61, 60, 61, 61, 62, 61, 62, 61, 62,
      61, 62, 62, 63, 62, 63, 62, 63, 62, 63, 63, 64, 63, 64, 63, 64, 63, 64,
      64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 65, 66, 65, 66, 66, 67,
      66, 67, 66, 67, 66, 67, 66, 67, 67, 68, 67, 68, 67, 68, 67, 68, 67, 68,
      68, 69, 68, 69, 68, 69, 68, 69, 68, 69, 69, 70, 69, 70, 69, 70, 69, 70,
      69, 70, 70, 71, 70, 71, 70, 71, 70, 71, 70, 71, 71, 72, 71, 72, 71, 72,
      71, 72, 71, 72, 71, 72, 72, 73, 72, 73, 72, 73, 72, 73, 72, 73, 73, 74,
      73, 74, 73, 74, 73, 74, 73, 74, 73, 74, 74, 75, 74, 75, 74, 75, 74, 75,
      74, 75, 74, 75, 74, 75, 75, 76, 75, 76, 75, 76, 75, 76, 75, 76, 75, 76,
      76, 77, 76, 77, 76, 77, 76, 77, 76, 77, 76, 77, 76, 77, 77, 78, 77, 78,
      77, 78, 77, 78, 77, 78, 77, 78, 77, 78, 78, 79, 78, 79, 78, 79, 78, 79,
      78, 79, 78, 79, 78, 79, 79, 79, 79, 79, 79, 79, 79};
  static_assert(static_cast<int64_t>(sizeof(kDim2Indices) /
                                     sizeof(kDim2Indices[0])) == kNumEntries,
                "column index table must have kNumEntries elements");
  Value dim2Index = rewriter.create<arith::ConstantOp>(
      loc, DenseElementsAttr::get(RankedTensorType::get({kNumEntries}, idxTy),
                                  ArrayRef<int64_t>(kDim2Indices)));

  // Start from a 201x80 tensor filled with `f0` and view it as a memref so
  // the individual entries can be written with memref.store.
  RankedTensorType melFilterType = RankedTensorType::get({201, 80}, f64Ty);
  Value melFilter = rewriter.create<tensor::SplatOp>(loc, melFilterType, f0);
  auto mTp =
      MemRefType::get(melFilterType.getShape(), melFilterType.getElementType());
  Value melFilterMemRef =
      rewriter.create<bufferization::ToMemrefOp>(loc, mTp, melFilter);

  // TODO : remove tomemref & totensor, and use insert to replace store. !!
  Value entryCount = rewriter.create<ConstantIndexOp>(loc, kNumEntries);
  auto loopOp = rewriter.create<scf::ForOp>(loc, c0, entryCount, c1);

  // Populate the loop body: fetch the i-th (value, row, column) triple and
  // store the value at that coordinate of the filter buffer.
  rewriter.setInsertionPointToStart(loopOp.getBody());
  Value iv = loopOp.getInductionVar();
  Value number = rewriter.create<tensor::ExtractOp>(loc, melFilterData, iv);
  Value d1 = rewriter.create<tensor::ExtractOp>(loc, dim1Index, iv);
  Value d2 = rewriter.create<tensor::ExtractOp>(loc, dim2Index, iv);
  rewriter.create<memref::StoreOp>(loc, number, melFilterMemRef,
                                   ValueRange{d1, d2});

  // Continue emitting after the loop so the caller's ops follow it.
  rewriter.setInsertionPointAfter(loopOp);

  Value newMelFilter = rewriter.create<bufferization::ToTensorOp>(
      loc, melFilterMemRef, /*restrict=*/true, /*writable=*/false);

  return newMelFilter;
}

Value getHanningWindow400(PatternRewriter &rewriter, Location loc) {
  FloatType f64Ty = rewriter.getF64Type();
  std::vector<double> hanningWindow400{0.0,
                                       6.168375916970614e-05,
                                       0.0002467198171342,
                                       0.0005550625190150482,
                                       0.0009866357858642205,
                                       0.001541333133436018,
                                       0.002219017698460002,
                                       0.003019522272410202,
                                       0.0039426493427611176,
                                       0.0049881711417212315,
                                       0.00615582970243117,
                                       0.007445336922613066,
                                       0.00885637463565564,
                                       0.01038859468911707,
                                       0.012041619030626338,
                                       0.013815039801161721,
                                       0.015708419435684517,
                                       0.017721290771101017,
                                       0.019853157161528523,
                                       0.02210349260083494,
                                       0.024471741852423234,
                                       0.02695732058622735,
                                       0.029559615522887273,
                                       0.03227798458506631,
                                       0.035111757055874326,
                                       0.03806023374435674,
                                       0.04112268715800954,
                                       0.044298361682277465,
                                       0.04758647376699032,
                                       0.05098621211969223,
                                       0.054496737905816106,
                                       0.05811718495565327,
                                       0.06184665997806821,
                                       0.06568424278090434,
                                       0.06962898649802812,
                                       0.07367991782295402,
                                       0.07783603724899257,
                                       0.08209631931586497,
                                       0.08645971286271914,
                                       0.09092514128748835,
                                       0.09549150281252633,
                                       0.10015767075645471,
                                       0.1049224938121548,
                                       0.10978479633083521,
                                       0.11474337861210543,
                                       0.11979701719998453,
                                       0.1249444651847702,
                                       0.1301844525106951,
                                       0.13551568628929433,
                                       0.14093685111840565,
                                       0.14644660940672627,
                                       0.15204360170384285,
                                       0.15772644703565564,
                                       0.1634937432451133,
                                       0.16934406733817414,
                                       0.17527597583490823,
                                       0.18128800512565513,
                                       0.1873786718321474,
                                       0.1935464731735117,
                                       0.19978988733705805,
                                       0.2061073738537635,
                                       0.21249737397836072,
                                       0.21895831107393465,
                                       0.22548859100093405,
                                       0.23208660251050156,
                                       0.2387507176420256,
                                       0.24547929212481434,
                                       0.2522706657837962,
                                       0.2591231629491423,
                                       0.2660350928697134,
                                       0.2730047501302266,
                                       0.2800304150720424,
                                       0.28711035421746367,
                                       0.2942428206974456,
                                       0.30142605468260963,
                                       0.30865828381745525,
                                       0.31593772365766115,
                                       0.3232625781103715,
                                       0.3306310398773543,
                                       0.3380412909009253,
                                       0.34549150281252644,
                                       0.3529798373838481,
                                       0.3605044469803854,
                                       0.36806347501731357,
                                       0.3756550564175726,
                                       0.38327731807204724,
                                       0.39092837930172886,
                                       0.3986063523217438,
                                       0.4063093427071377,
                                       0.41403544986029517,
                                       0.4217827674798846,
                                       0.4295493840312088,
                                       0.4373333832178479,
                                       0.44513284445447737,
                                       0.45294584334074284,
                                       0.4607704521360776,
                                       0.4686047402353433,
                                       0.4764467746451787,
                                       0.48429462046093585,
                                       0.49214634134408974,
                                       0.5,
                                       0.5078536586559104,
                                       0.5157053795390641,
                                       0.5235532253548213,
                                       0.5313952597646567,
                                       0.5392295478639225,
                                       0.5470541566592572,
                                       0.5548671555455227,
                                       0.5626666167821522,
                                       0.5704506159687914,
                                       0.5782172325201155,
                                       0.5859645501397047,
                                       0.5936906572928624,
                                       0.6013936476782563,
                                       0.6090716206982714,
                                       0.6167226819279528,
                                       0.6243449435824273,
                                       0.6319365249826864,
                                       0.6394955530196147,
                                       0.647020162616152,
                                       0.6545084971874737,
                                       0.6619587090990747,
                                       0.6693689601226458,
                                       0.6767374218896286,
                                       0.6840622763423391,
                                       0.6913417161825449,
                                       0.6985739453173903,
                                       0.7057571793025544,
                                       0.7128896457825363,
                                       0.7199695849279575,
                                       0.7269952498697734,
                                       0.7339649071302867,
                                       0.7408768370508576,
                                       0.7477293342162038,
                                       0.7545207078751857,
                                       0.7612492823579744,
                                       0.7679133974894983,
                                       0.7745114089990659,
                                       0.7810416889260654,
                                       0.7875026260216393,
                                       0.7938926261462367,
                                       0.8002101126629421,
                                       0.8064535268264883,
                                       0.8126213281678527,
                                       0.8187119948743449,
                                       0.8247240241650918,
                                       0.8306559326618259,
                                       0.8365062567548867,
                                       0.8422735529643444,
                                       0.8479563982961571,
                                       0.8535533905932737,
                                       0.8590631488815944,
                                       0.8644843137107058,
                                       0.8698155474893048,
                                       0.8750555348152298,
                                       0.8802029828000155,
                                       0.8852566213878946,
                                       0.8902152036691648,
                                       0.8950775061878451,
                                       0.8998423292435453,
                                       0.9045084971874737,
                                       0.9090748587125117,
                                       0.9135402871372809,
                                       0.9179036806841352,
                                       0.9221639627510075,
                                       0.9263200821770461,
                                       0.9303710135019718,
                                       0.9343157572190957,
                                       0.9381533400219317,
                                       0.9418828150443468,
                                       0.9455032620941839,
                                       0.9490137878803078,
                                       0.9524135262330098,
                                       0.9557016383177226,
                                       0.9588773128419905,
                                       0.9619397662556434,
                                       0.9648882429441257,
                                       0.9677220154149337,
                                       0.9704403844771128,
                                       0.9730426794137726,
                                       0.9755282581475768,
                                       0.977896507399165,
                                       0.9801468428384715,
                                       0.982278709228899,
                                       0.9842915805643155,
                                       0.9861849601988383,
                                       0.9879583809693737,
                                       0.9896114053108829,
                                       0.9911436253643444,
                                       0.9925546630773869,
                                       0.9938441702975689,
                                       0.9950118288582788,
                                       0.996057350657239,
                                       0.9969804777275899,
                                       0.99778098230154,
                                       0.998458666866564,
                                       0.9990133642141358,
                                       0.9994449374809851,
                                       0.9997532801828658,
                                       0.9999383162408303,
                                       1.0,
                                       0.9999383162408303,
                                       0.9997532801828658,
                                       0.9994449374809851,
                                       0.9990133642141358,
                                       0.998458666866564,
                                       0.99778098230154,
                                       0.9969804777275899,
                                       0.996057350657239,
                                       0.9950118288582788,
                                       0.9938441702975689,
                                       0.9925546630773869,
                                       0.9911436253643444,
                                       0.9896114053108829,
                                       0.9879583809693737,
                                       0.9861849601988383,
                                       0.9842915805643155,
                                       0.982278709228899,
                                       0.9801468428384715,
                                       0.977896507399165,
                                       0.9755282581475768,
                                       0.9730426794137726,
                                       0.9704403844771128,
                                       0.9677220154149337,
                                       0.9648882429441257,
                                       0.9619397662556434,
                                       0.9588773128419905,
                                       0.9557016383177226,
                                       0.9524135262330098,
                                       0.9490137878803078,
                                       0.9455032620941839,
                                       0.9418828150443468,
                                       0.9381533400219317,
                                       0.9343157572190957,
                                       0.9303710135019718,
                                       0.9263200821770461,
                                       0.9221639627510075,
                                       0.9179036806841352,
                                       0.9135402871372809,
                                       0.9090748587125117,
                                       0.9045084971874737,
                                       0.8998423292435453,
                                       0.8950775061878451,
                                       0.8902152036691648,
                                       0.8852566213878946,
                                       0.8802029828000155,
                                       0.8750555348152298,
                                       0.8698155474893048,
                                       0.8644843137107058,
                                       0.8590631488815944,
                                       0.8535533905932737,
                                       0.8479563982961571,
                                       0.8422735529643444,
                                       0.8365062567548867,
                                       0.8306559326618259,
                                       0.8247240241650918,
                                       0.8187119948743449,
                                       0.8126213281678527,
                                       0.8064535268264883,
                                       0.8002101126629421,
                                       0.7938926261462367,
                                       0.7875026260216393,
                                       0.7810416889260654,
                                       0.7745114089990659,
                                       0.7679133974894983,
                                       0.7612492823579744,
                                       0.7545207078751857,
                                       0.7477293342162038,
                                       0.7408768370508576,
                                       0.7339649071302867,
                                       0.7269952498697734,
                                       0.7199695849279575,
                                       0.7128896457825363,
                                       0.7057571793025544,
                                       0.6985739453173903,
                                       0.6913417161825449,
                                       0.6840622763423391,
                                       0.6767374218896286,
                                       0.6693689601226458,
                                       0.6619587090990747,
                                       0.6545084971874737,
                                       0.647020162616152,
                                       0.6394955530196147,
                                       0.6319365249826864,
                                       0.6243449435824273,
                                       0.6167226819279528,
                                       0.6090716206982714,
                                       0.6013936476782563,
                                       0.5936906572928624,
                                       0.5859645501397047,
                                       0.5782172325201155,
                                       0.5704506159687914,
                                       0.5626666167821522,
                                       0.5548671555455227,
                                       0.5470541566592572,
                                       0.5392295478639225,
                                       0.5313952597646567,
                                       0.5235532253548213,
                                       0.5157053795390641,
                                       0.5078536586559104,
                                       0.5,
                                       0.49214634134408974,
                                       0.48429462046093585,
                                       0.4764467746451787,
                                       0.4686047402353433,
                                       0.4607704521360776,
                                       0.45294584334074284,
                                       0.44513284445447737,
                                       0.4373333832178479,
                                       0.4295493840312088,
                                       0.4217827674798846,
                                       0.41403544986029517,
                                       0.4063093427071377,
                                       0.3986063523217438,
                                       0.39092837930172886,
                                       0.38327731807204724,
                                       0.3756550564175726,
                                       0.36806347501731357,
                                       0.3605044469803854,
                                       0.3529798373838481,
                                       0.34549150281252644,
                                       0.3380412909009253,
                                       0.3306310398773543,
                                       0.3232625781103715,
                                       0.31593772365766115,
                                       0.30865828381745525,
                                       0.30142605468260963,
                                       0.2942428206974456,
                                       0.28711035421746367,
                                       0.2800304150720424,
                                       0.2730047501302266,
                                       0.2660350928697134,
                                       0.2591231629491423,
                                       0.2522706657837962,
                                       0.24547929212481434,
                                       0.2387507176420256,
                                       0.23208660251050156,
                                       0.22548859100093405,
                                       0.21895831107393465,
                                       0.21249737397836072,
                                       0.2061073738537635,
                                       0.19978988733705805,
                                       0.1935464731735117,
                                       0.1873786718321474,
                                       0.18128800512565513,
                                       0.17527597583490823,
                                       0.16934406733817414,
                                       0.1634937432451133,
                                       0.15772644703565564,
                                       0.15204360170384285,
                                       0.14644660940672627,
                                       0.14093685111840565,
                                       0.13551568628929433,
                                       0.1301844525106951,
                                       0.1249444651847702,
                                       0.11979701719998453,
                                       0.11474337861210543,
                                       0.10978479633083521,
                                       0.1049224938121548,
                                       0.10015767075645471,
                                       0.09549150281252633,
                                       0.09092514128748835,
                                       0.08645971286271914,
                                       0.08209631931586497,
                                       0.07783603724899257,
                                       0.07367991782295402,
                                       0.06962898649802812,
                                       0.06568424278090434,
                                       0.06184665997806821,
                                       0.05811718495565327,
                                       0.054496737905816106,
                                       0.05098621211969223,
                                       0.04758647376699032,
                                       0.044298361682277465,
                                       0.04112268715800954,
                                       0.03806023374435674,
                                       0.035111757055874326,
                                       0.03227798458506631,
                                       0.029559615522887273,
                                       0.02695732058622735,
                                       0.024471741852423234,
                                       0.02210349260083494,
                                       0.019853157161528523,
                                       0.017721290771101017,
                                       0.015708419435684517,
                                       0.013815039801161721,
                                       0.012041619030626338,
                                       0.01038859468911707,
                                       0.00885637463565564,
                                       0.007445336922613066,
                                       0.00615582970243117,
                                       0.0049881711417212315,
                                       0.0039426493427611176,
                                       0.003019522272410202,
                                       0.002219017698460002,
                                       0.001541333133436018,
                                       0.0009866357858642205,
                                       0.0005550625190150482,
                                       0.0002467198171342,
                                       6.168375916970614e-05};
  Value window = rewriter.create<arith::ConstantOp>(
      loc, DenseFPElementsAttr::get(RankedTensorType::get(400, f64Ty),
                                    ArrayRef<double>(hanningWindow400)));
  return window;
}

// Numpy-style "reflect" padding for a tensor indexed along dimension 0.
// `low` mirrored elements are prepended and `high` mirrored elements are
// appended, using two chained tensor.pad ops (left side first, then right).
//   - left pad body:  out[i] = input[low - i]
//   - right pad body: out[i] = leftPadded[last - (i - last)],
//                     where last = dim(leftPadded, 0) - c1
// NOTE(review): c0 / c1 are presumably the index constants 0 and 1 supplied by
// the caller - confirm at the call sites.
Value padReflect(PatternRewriter &rewriter, Location loc, Value c0, Value c1,
                 Value input, int64_t low, int64_t high) {
  // Both padding amounts are materialized up front as index constants.
  Value leftLen = rewriter.create<ConstantIndexOp>(loc, low);
  Value rightLen = rewriter.create<ConstantIndexOp>(loc, high);

  FloatType elemTy = rewriter.getF64Type();
  IndexType indexTy = rewriter.getIndexType();
  auto inputTy = llvm::cast<RankedTensorType>(input.getType());
  int64_t leftPaddedLen = inputTy.getShape()[0] + low;
  int64_t fullyPaddedLen = leftPaddedLen + high;

  // Stage 1: pad on the left only (high padding is c0).
  SmallVector<OpFoldResult> leftStageLow;
  SmallVector<OpFoldResult> leftStageHigh;
  leftStageLow.push_back(leftLen);
  leftStageHigh.push_back(c0);
  auto leftPad = rewriter.create<tensor::PadOp>(
      loc, RankedTensorType::get(leftPaddedLen, elemTy), input, leftStageLow,
      leftStageHigh);
  {
    // The guard puts the insertion point back after the pad op once the
    // region body has been filled in.
    OpBuilder::InsertionGuard guard(rewriter);
    Region *body = &leftPad.getRegion();
    int64_t rank = inputTy.getRank();
    SmallVector<Type> argTypes(rank, indexTy);
    SmallVector<Location> argLocs(rank, loc);
    rewriter.createBlock(body, body->end(), argTypes, argLocs);

    Value outPos = body->front().getArgument(0);
    Value srcPos = rewriter.create<arith::SubIOp>(loc, leftLen, outPos);
    Value mirrored = rewriter.create<tensor::ExtractOp>(loc, input, srcPos);
    rewriter.create<tensor::YieldOp>(loc, mirrored);
  }
  Value leftPadded = leftPad.getResult();

  // Stage 2: pad the left-padded tensor on the right only.
  SmallVector<OpFoldResult> rightStageLow;
  SmallVector<OpFoldResult> rightStageHigh;
  rightStageLow.push_back(c0);
  rightStageHigh.push_back(rightLen);
  Value leftPaddedDim = rewriter.create<tensor::DimOp>(loc, leftPadded, c0);
  // Index of the last element of the left-padded tensor: the mirror axis.
  Value lastPos = rewriter.create<arith::SubIOp>(loc, leftPaddedDim, c1);
  auto rightPad = rewriter.create<tensor::PadOp>(
      loc, RankedTensorType::get(fullyPaddedLen, elemTy), leftPadded,
      rightStageLow, rightStageHigh);
  {
    OpBuilder::InsertionGuard guard(rewriter);
    Region *body = &rightPad.getRegion();
    int64_t rank =
        llvm::cast<RankedTensorType>(leftPadded.getType()).getRank();
    SmallVector<Type> argTypes(rank, indexTy);
    SmallVector<Location> argLocs(rank, loc);
    rewriter.createBlock(body, body->end(), argTypes, argLocs);

    Value outPos = body->front().getArgument(0);
    // Distance past the mirror axis, then the same distance before it.
    Value overshoot = rewriter.create<arith::SubIOp>(loc, outPos, lastPos);
    Value srcPos = rewriter.create<arith::SubIOp>(loc, lastPos, overshoot);
    Value mirrored =
        rewriter.create<tensor::ExtractOp>(loc, leftPadded, srcPos);
    rewriter.create<tensor::YieldOp>(loc, mirrored);
  }

  return rightPad.getResult();
}

// Debug helper: emits a vector.print of "Print Start:\n", then an scf.for over
// [0, l) that loads input[i] (single-index memref load) and prints it, then a
// final vector.print of "\n".
void printMemref(OpBuilder &rewriter, Location loc, Value input, int l) {
  Value lowerBound = rewriter.create<ConstantIndexOp>(loc, 0);
  Value unitStep = rewriter.create<ConstantIndexOp>(loc, 1);
  Value upperBound = rewriter.create<ConstantIndexOp>(loc, l);
  rewriter.create<vector::PrintOp>(loc, "Print Start:\n");

  // Loop body: load one element and print it.
  auto emitBody = [&](OpBuilder &nested, Location bodyLoc, Value pos,
                      ValueRange) {
    Value elem = nested.create<memref::LoadOp>(bodyLoc, input, pos);
    nested.create<vector::PrintOp>(bodyLoc, elem);
    nested.create<scf::YieldOp>(bodyLoc, std::nullopt);
  };
  rewriter.create<scf::ForOp>(loc, lowerBound, upperBound, unitStep,
                              std::nullopt, emitBody);

  rewriter.create<vector::PrintOp>(loc, "\n");
}

// WA CC CH PM MULPM C1 C1w C2 CH2 CH2w CH_radfg CCw CSARR AR AI IANG are helper
// functions for RFFTP

// Emits a load of wa[x * (ido - c1) + i].
// NOTE(review): c1 is presumably the index constant 1 - confirm at call sites.
inline Value WA(OpBuilder &builder, Location loc, Value wa, Value x, Value i,
                Value ido, Value c1) {
  Value rowLen = builder.create<arith::SubIOp>(loc, ido, c1);
  Value rowStart = builder.create<arith::MulIOp>(loc, x, rowLen);
  Value offset = builder.create<arith::AddIOp>(loc, rowStart, i);
  return builder.create<memref::LoadOp>(loc, wa, offset);
}

// Emits a load of cc[(l1 * c + b) * ido + a].
inline Value CC(OpBuilder &builder, Location loc, Value cc, Value a, Value b,
                Value c, Value ido, Value l1) {
  Value plane = builder.create<arith::MulIOp>(loc, l1, c);
  Value row = builder.create<arith::AddIOp>(loc, plane, b);
  Value rowStart = builder.create<arith::MulIOp>(loc, row, ido);
  Value offset = builder.create<arith::AddIOp>(loc, rowStart, a);
  return builder.create<memref::LoadOp>(loc, cc, offset);
}

// Emits a store of `toWrite` into ch[(cdim * c + b) * ido + a].
inline void CH(OpBuilder &builder, Location loc, Value ch, Value a, Value b,
               Value c, Value ido, Value cdim, Value toWrite) {
  Value plane = builder.create<arith::MulIOp>(loc, cdim, c);
  Value row = builder.create<arith::AddIOp>(loc, plane, b);
  Value rowStart = builder.create<arith::MulIOp>(loc, row, ido);
  Value offset = builder.create<arith::AddIOp>(loc, rowStart, a);
  builder.create<memref::StoreOp>(loc, toWrite, ch, offset);
}

// Emits the float sum and difference of c and d; returns {c + d, c - d}.
inline std::vector<Value> PM(OpBuilder &builder, Location loc, Value c,
                             Value d) {
  Value sum = builder.create<arith::AddFOp>(loc, c, d);
  Value diff = builder.create<arith::SubFOp>(loc, c, d);
  return {sum, diff};
}

// Emits the four float products of (c, d) with (e, f) and combines them;
// returns {c * e + d * f, c * f - d * e}.
inline std::vector<Value> MULPM(OpBuilder &builder, Location loc, Value c,
                                Value d, Value e, Value f) {
  Value ce = builder.create<arith::MulFOp>(loc, c, e);
  Value df = builder.create<arith::MulFOp>(loc, d, f);
  Value cf = builder.create<arith::MulFOp>(loc, c, f);
  Value de = builder.create<arith::MulFOp>(loc, d, e);
  Value first = builder.create<arith::AddFOp>(loc, ce, df);
  Value second = builder.create<arith::SubFOp>(loc, cf, de);
  return {first, second};
}

// Emits a load of cc[(l1 * c + b) * ido + a].
inline Value C1(OpBuilder &builder, Location loc, Value cc, Value a, Value b,
                Value c, Value ido, Value l1) {
  Value slab = builder.create<arith::MulIOp>(loc, l1, c);
  Value line = builder.create<arith::AddIOp>(loc, slab, b);
  Value lineStart = builder.create<arith::MulIOp>(loc, line, ido);
  Value pos = builder.create<arith::AddIOp>(loc, lineStart, a);
  return builder.create<memref::LoadOp>(loc, cc, pos);
}

// Emits a store of `toWrite` into cc[(l1 * c + b) * ido + a].
inline void C1w(OpBuilder &builder, Location loc, Value cc, Value a, Value b,
                Value c, Value ido, Value l1, Value toWrite) {
  Value slab = builder.create<arith::MulIOp>(loc, l1, c);
  Value line = builder.create<arith::AddIOp>(loc, slab, b);
  Value lineStart = builder.create<arith::MulIOp>(loc, line, ido);
  Value pos = builder.create<arith::AddIOp>(loc, lineStart, a);
  builder.create<memref::StoreOp>(loc, toWrite, cc, pos);
}

// Emits a load of cc[idl1 * b + a].
inline Value C2(OpBuilder &builder, Location loc, Value cc, Value a, Value b,
                Value idl1) {
  Value columnStart = builder.create<arith::MulIOp>(loc, idl1, b);
  Value offset = builder.create<arith::AddIOp>(loc, columnStart, a);
  return builder.create<memref::LoadOp>(loc, cc, offset);
}

// Emits a load of ch[idl1 * b + a].
inline Value CH2(OpBuilder &builder, Location loc, Value ch, Value a, Value b,
                 Value idl1) {
  Value columnStart = builder.create<arith::MulIOp>(loc, idl1, b);
  Value offset = builder.create<arith::AddIOp>(loc, columnStart, a);
  return builder.create<memref::LoadOp>(loc, ch, offset);
}

// Emits a store of `toWrite` into ch[idl1 * b + a].
inline void CH2w(OpBuilder &builder, Location loc, Value ch, Value a, Value b,
                 Value idl1, Value toWrite) {
  Value columnStart = builder.create<arith::MulIOp>(loc, idl1, b);
  Value offset = builder.create<arith::AddIOp>(loc, columnStart, a);
  builder.create<memref::StoreOp>(loc, toWrite, ch, offset);
}

// Emits a load of ch[(b + l1 * c) * ido + a].
inline Value CH_radfg(OpBuilder &builder, Location loc, Value ch, Value a,
                      Value b, Value c, Value ido, Value l1) {
  Value plane = builder.create<arith::MulIOp>(loc, l1, c);
  Value row = builder.create<arith::AddIOp>(loc, b, plane);
  Value rowStart = builder.create<arith::MulIOp>(loc, row, ido);
  Value offset = builder.create<arith::AddIOp>(loc, rowStart, a);
  return builder.create<memref::LoadOp>(loc, ch, offset);
}

// Emits a store of `toWrite` into cc[(b + cdim * c) * ido + a].
inline void CCw(OpBuilder &builder, Location loc, Value cc, Value a, Value b,
                Value c, Value ido, Value cdim, Value toWrite) {
  Value plane = builder.create<arith::MulIOp>(loc, cdim, c);
  Value row = builder.create<arith::AddIOp>(loc, b, plane);
  Value rowStart = builder.create<arith::MulIOp>(loc, row, ido);
  Value offset = builder.create<arith::AddIOp>(loc, rowStart, a);
  builder.create<memref::StoreOp>(loc, toWrite, cc, offset);
}

// Emits a load of csarr[index].
inline Value CSARR(OpBuilder &builder, Location loc, Value csarr, Value index) {
  Value element = builder.create<memref::LoadOp>(loc, csarr, index);
  return element;
}

// Emits a load of csarr[iang * 2], i.e. the even slot of pair `iang`.
inline Value AR(OpBuilder &builder, Location loc, Value csarr, Value iang) {
  Value two = builder.create<ConstantIndexOp>(loc, 2);
  Value evenSlot = builder.create<arith::MulIOp>(loc, iang, two);
  return builder.create<memref::LoadOp>(loc, csarr, evenSlot);
}

// Emits a load of csarr[iang * 2 + 1], i.e. the odd slot of pair `iang`.
inline Value AI(OpBuilder &builder, Location loc, Value csarr, Value iang) {
  Value one = builder.create<ConstantIndexOp>(loc, 1);
  Value two = builder.create<ConstantIndexOp>(loc, 2);
  Value evenSlot = builder.create<arith::MulIOp>(loc, iang, two);
  Value oddSlot = builder.create<arith::AddIOp>(loc, evenSlot, one);
  return builder.create<memref::LoadOp>(loc, csarr, oddSlot);
}

// Advances `iang` by `l` and wraps once at `ip`: emits an scf.if that yields
// (iang + l) - ip when iang + l >= ip (signed compare), and iang + l otherwise.
// Returns the scf.if result.
inline Value IANG(OpBuilder &builder, Location loc, Value iang, Value l,
                  Value ip) {
  Value advanced = builder.create<arith::AddIOp>(loc, iang, l);
  Value reachedLimit = builder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::sge, advanced, ip);

  // then-branch: subtract ip once.
  auto wrapBranch = [&](OpBuilder &nested, Location branchLoc) {
    Value wrapped = nested.create<arith::SubIOp>(branchLoc, advanced, ip);
    nested.create<scf::YieldOp>(branchLoc, ValueRange{wrapped});
  };
  // else-branch: pass the advanced value through unchanged.
  auto keepBranch = [&](OpBuilder &nested, Location branchLoc) {
    nested.create<scf::YieldOp>(branchLoc, ValueRange{advanced});
  };

  auto chosen =
      builder.create<scf::IfOp>(loc, reachedLimit, wrapBranch, keepBranch);
  return chosen.getResult(0);
}

// Emits the loops that move the intermediate results held in `ch` back into
// `cc` for a general-radix real FFT step.
// NOTE(review): this appears to mirror the tail of pocketfft's `radfg` (the
// part after the twiddle/rotation loops) - confirm against the reference.
//
// Parameters:
//   cc, ch  - buffers accessed through the C2/CH2/CH2w/CH_radfg/CCw helpers
//             (memref loads and stores).
//   wa      - not referenced in this function; kept for the signature.
//   csarr   - not referenced in this function; kept for the signature.
//   ido, ip, l1 - index values describing the current step's extents.
//
// Emitted steps, in order:
//   1. ch[ik] = cc[ik]                          for ik in [0, ido*l1)
//   2. ch[ik] += cc[ik + ido*l1*j]              for j in [1, (ip+1)/2)
//   3. CC(i, 0, k) = CH(i, k, 0)                for k in [0, l1), i in [0, ido)
//   4. CC(ido-1, 2j-1, k) = CH(0, k, j)
//      CC(0,     2j,   k) = CH(0, k, jc)        for j in [1, (ip+1)/2), jc = ip-j
//   5. only when ido != 1: the paired sum/difference scatter over odd i.
void radfgExtend(OpBuilder &opBuilder, Location loc, Value cc, Value ch,
                 Value wa, Value csarr, Value ido, Value ip, Value l1) {

  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value c3 = opBuilder.create<ConstantIndexOp>(loc, 3);

  // cdim equals ip (computed as ip - 0); it is the middle extent used by CCw.
  Value cdim = opBuilder.create<arith::SubIOp>(loc, ip, c0);
  // ipph = (ip + 1) / 2
  Value tmp0 = opBuilder.create<arith::AddIOp>(loc, ip, c1);
  Value ipph = opBuilder.create<arith::DivSIOp>(loc, tmp0, c2);
  Value idom1 = opBuilder.create<arith::SubIOp>(loc, ido, c1);
  Value idl1 = opBuilder.create<arith::MulIOp>(loc, ido, l1);

  // Step 1: copy column 0 of cc (viewed as idl1 x ip) into column 0 of ch.
  opBuilder.create<scf::ForOp>(
      loc, c0, idl1, c1, std::nullopt,
      [&](OpBuilder &builder, Location loc, Value ik, ValueRange ik_args) {
        Value c2ik0 = C2(builder, loc, cc, ik, c0, idl1);
        CH2w(builder, loc, ch, ik, c0, idl1, c2ik0);
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });

  // Step 2: accumulate columns 1 .. ipph-1 of cc into column 0 of ch.
  opBuilder.create<scf::ForOp>(
      loc, c1, ipph, c1, std::nullopt,
      [&](OpBuilder &builder, Location loc, Value j, ValueRange j_args) {
        builder.create<scf::ForOp>(
            loc, c0, idl1, c1, std::nullopt,
            [&](OpBuilder &b, Location loc, Value ik, ValueRange ik_args) {
              Value c2ikj = C2(b, loc, cc, ik, j, idl1);
              Value ch2ik0 = CH2(b, loc, ch, ik, c0, idl1);
              Value ch2ik0_updated =
                  b.create<arith::AddFOp>(loc, ch2ik0, c2ikj);

              CH2w(b, loc, ch, ik, c0, idl1, ch2ik0_updated);
              b.create<scf::YieldOp>(loc, std::nullopt);
            });
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });

  // Step 3: CC(i, 0, k) = CH(i, k, 0).
  opBuilder.create<scf::ForOp>(
      loc, c0, l1, c1, std::nullopt,
      [&](OpBuilder &builder, Location loc, Value k, ValueRange k_args) {
        builder.create<scf::ForOp>(
            loc, c0, ido, c1, std::nullopt,
            [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) {
              Value chik0 = CH_radfg(b, loc, ch, i, k, c0, ido, l1);

              CCw(b, loc, cc, i, c0, k, ido, cdim, chik0);
              b.create<scf::YieldOp>(loc, std::nullopt);
            });
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });

  // Step 4: j counts up from 1 while jc counts down from ip - 1; both are
  // carried as loop iter_args alongside the induction variable.
  Value j_start_0 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value jc_start_0 = opBuilder.create<arith::SubIOp>(loc, ip, c1);

  opBuilder.create<scf::ForOp>(
      loc, c1, ipph, c1, ValueRange{j_start_0, jc_start_0},
      [&](OpBuilder &builder, Location loc, Value j_loop,
          ValueRange j_loop_args) {
        Value j = j_loop_args[0];
        Value jc = j_loop_args[1];

        // j2 = 2 * j - 1, j2p1 = 2 * j
        Value tmp = builder.create<arith::MulIOp>(loc, j, c2);
        Value j2 = builder.create<arith::SubIOp>(loc, tmp, c1);
        Value j2p1 = builder.create<arith::AddIOp>(loc, j2, c1);

        builder.create<scf::ForOp>(
            loc, c0, l1, c1, std::nullopt,
            [&](OpBuilder &b, Location loc, Value k, ValueRange k_args) {
              Value ch0kj = CH_radfg(b, loc, ch, c0, k, j, ido, l1);
              CCw(b, loc, cc, idom1, j2, k, ido, cdim, ch0kj);

              Value ch0kjc = CH_radfg(b, loc, ch, c0, k, jc, ido, l1);
              CCw(b, loc, cc, c0, j2p1, k, ido, cdim, ch0kjc);

              b.create<scf::YieldOp>(loc, std::nullopt);
            });

        Value j_next = builder.create<arith::AddIOp>(loc, j, c1);
        Value jc_next = builder.create<arith::SubIOp>(loc, jc, c1);
        builder.create<scf::YieldOp>(loc, std::vector<Value>{j_next, jc_next});
      });

  // Step 5 is guarded by ido != 1 (the constant one, not the l1 extent): for
  // ido == 1 the inner i-loop below has no iterations, while for any other
  // ido - including ido == l1 - the scatter must run.
  Value condition1 =
      opBuilder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne, ido, c1);

  opBuilder.create<scf::IfOp>(
      loc, condition1, [&](OpBuilder &builder, Location loc) {
        Value j_start_1 = builder.create<ConstantIndexOp>(loc, 1);
        Value jc_start_1 = builder.create<arith::SubIOp>(loc, ip, c1);

        builder.create<scf::ForOp>(
            loc, c1, ipph, c1, ValueRange{j_start_1, jc_start_1},
            [&](OpBuilder &b, Location loc, Value j_loop,
                ValueRange j_loop_args) {
              Value j = j_loop_args[0];
              Value jc = j_loop_args[1];

              // j2 = 2 * j - 1, j2p1 = 2 * j
              Value tmp = b.create<arith::MulIOp>(loc, j, c2);
              Value j2 = b.create<arith::SubIOp>(loc, tmp, c1);
              Value j2p1 = b.create<arith::AddIOp>(loc, j2, c1);

              b.create<scf::ForOp>(
                  loc, c0, l1, c1, std::nullopt,
                  [&](OpBuilder &b2, Location loc, Value k, ValueRange k_args) {
                    // i walks 1, 3, 5, ... < ido - 1 while ic walks
                    // ido - 3, ido - 5, ... in lock step.
                    Value i_start_0 = b2.create<ConstantIndexOp>(loc, 1);
                    Value ic_start_0 = b2.create<arith::SubIOp>(loc, ido, c3);

                    b2.create<scf::ForOp>(
                        loc, c1, idom1, c2, ValueRange{i_start_0, ic_start_0},
                        [&](OpBuilder &b3, Location loc, Value i_loop,
                            ValueRange i_loop_args) {
                          Value i = i_loop_args[0];
                          Value ic = i_loop_args[1];

                          Value ip1 = b3.create<arith::AddIOp>(loc, i, c1);
                          Value icp1 = b3.create<arith::AddIOp>(loc, ic, c1);

                          // CC(i, 2j, k)    = CH(i,k,j) + CH(i,k,jc)
                          // CC(ic, 2j-1, k) = CH(i,k,j) - CH(i,k,jc)
                          Value chikj = CH_radfg(b3, loc, ch, i, k, j, ido, l1);
                          Value chikjc =
                              CH_radfg(b3, loc, ch, i, k, jc, ido, l1);
                          Value tmp2 =
                              b3.create<arith::AddFOp>(loc, chikj, chikjc);
                          Value tmp3 =
                              b3.create<arith::SubFOp>(loc, chikj, chikjc);
                          CCw(b3, loc, cc, i, j2p1, k, ido, cdim, tmp2);
                          CCw(b3, loc, cc, ic, j2, k, ido, cdim, tmp3);

                          // CC(i+1, 2j, k)    = CH(i+1,k,j) + CH(i+1,k,jc)
                          // CC(ic+1, 2j-1, k) = CH(i+1,k,jc) - CH(i+1,k,j)
                          Value chip1kj =
                              CH_radfg(b3, loc, ch, ip1, k, j, ido, l1);
                          Value chip1kjc =
                              CH_radfg(b3, loc, ch, ip1, k, jc, ido, l1);
                          Value tmp4 =
                              b3.create<arith::AddFOp>(loc, chip1kj, chip1kjc);
                          Value tmp5 =
                              b3.create<arith::SubFOp>(loc, chip1kjc, chip1kj);
                          CCw(b3, loc, cc, ip1, j2p1, k, ido, cdim, tmp4);
                          CCw(b3, loc, cc, icp1, j2, k, ido, cdim, tmp5);

                          Value i_next = b3.create<arith::AddIOp>(loc, i, c2);
                          Value ic_next = b3.create<arith::SubIOp>(loc, ic, c2);
                          b3.create<scf::YieldOp>(
                              loc, std::vector<Value>{i_next, ic_next});
                        });
                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  });

              Value j_next = b.create<arith::AddIOp>(loc, j, c1);
              Value jc_next = b.create<arith::SubIOp>(loc, jc, c1);
              b.create<scf::YieldOp>(loc, std::vector<Value>{j_next, jc_next});
            });
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });
}

// Handle general radix FFT computation.
//
// Emits the scf/arith/memref IR for the first stages of the general-radix
// pass and then hands over to radfgExtend() for the remaining stages.
//
// Parameters (all are SSA values already available at the insertion point):
//   cc, ch  - work buffers, only touched through the C1/C1w, C2 and CH2/CH2w
//             accessor helpers defined elsewhere in this file.
//   wa      - twiddle buffer, loaded directly with memref.load.
//   csarr   - coefficient buffer, read through the CSARR/AR/AI helpers.
//   ido, ip, l1 - index-typed sizes; `ip` is the radix being processed.
//
// Stages emitted here, in order:
//   1. (only if ido > 1) multiply the C1 entries by the twiddles in `wa`
//      for every mirrored pair (j, jc = ip - j).
//   2. combine C1(0, k, j) / C1(0, k, jc) into their sum and difference.
//   3. for every mirrored pair (l, lc) accumulate CH2(., l) / CH2(., lc) from
//      C2 and the coefficients in csarr; the accumulation over j is unrolled
//      by 4, then by 2, then handled one element at a time.
void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa,
           Value csarr, Value ido, Value ip, Value l1) {

  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value c3 = opBuilder.create<ConstantIndexOp>(loc, 3);
  Value c4 = opBuilder.create<ConstantIndexOp>(loc, 4);

  Value ipm1 = opBuilder.create<arith::SubIOp>(loc, ip, c1);
  Value ipm2 = opBuilder.create<arith::SubIOp>(loc, ip, c2);

  // ipph = (ip + 1) / 2: exclusive upper bound of the "lower half" indices.
  Value tmp = opBuilder.create<arith::AddIOp>(loc, ip, c1);
  Value ipph = opBuilder.create<arith::DivSIOp>(loc, tmp, c2);

  Value idl1 = opBuilder.create<arith::MulIOp>(loc, ido, l1);
  Value idom1 = opBuilder.create<arith::SubIOp>(loc, ido, c1);

  // The twiddle stage is required whenever ido > 1. The guard has to compare
  // against the constant 1 (not against `l1`): otherwise the stage would be
  // skipped for 1 < ido <= l1. For ido <= 1 the innermost loop below has an
  // empty iteration space anyway, so the guard only avoids emitting dead
  // loops at run time.
  Value condition =
      opBuilder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, ido, c1);

  opBuilder.create<scf::IfOp>(
      loc, condition, [&](OpBuilder &builder, Location loc) {
        Value jc_start = builder.create<arith::SubIOp>(loc, ip, c1);

        // j runs upwards from 1 while the carried jc runs downwards from
        // ip - 1.
        builder.create<scf::ForOp>(
            loc, c1, ipph, c1, ValueRange{jc_start},
            [&](OpBuilder &b, Location loc, Value j, ValueRange j_args) {
              Value jc = j_args[0];

              Value jm1 = b.create<arith::SubIOp>(loc, j, c1);
              Value jcm1 = b.create<arith::SubIOp>(loc, jc, c1);

              // Start offsets into `wa` for the j and jc rows.
              Value is = b.create<arith::MulIOp>(loc, jm1, idom1);
              Value is2 = b.create<arith::MulIOp>(loc, jcm1, idom1);

              b.create<scf::ForOp>(
                  loc, c0, l1, c1, std::nullopt,
                  [&](OpBuilder &b2, Location loc, Value k, ValueRange k_args) {
                    Value idij_start = b2.create<arith::SubIOp>(loc, is, c0);
                    Value idij2_start = b2.create<arith::SubIOp>(loc, is2, c0);

                    // i = 1, 3, ..., < ido - 1; idij / idij2 are carried and
                    // advance by 2 together with i.
                    b2.create<scf::ForOp>(
                        loc, c1, idom1, c2, ValueRange{idij_start, idij2_start},
                        [&](OpBuilder &b3, Location loc, Value i,
                            ValueRange i_args) {
                          Value idij = i_args[0];
                          Value idij2 = i_args[1];

                          Value ip1 = b3.create<arith::AddIOp>(loc, i, c1);
                          Value idijp1 =
                              b3.create<arith::AddIOp>(loc, idij, c1);
                          Value idij2p1 =
                              b3.create<arith::AddIOp>(loc, idij2, c1);

                          Value t1 = C1(b3, loc, cc, i, k, j, ido, l1);
                          Value t2 = C1(b3, loc, cc, ip1, k, j, ido, l1);
                          Value t3 = C1(b3, loc, cc, i, k, jc, ido, l1);
                          Value t4 = C1(b3, loc, cc, ip1, k, jc, ido, l1);

                          Value waidij =
                              b3.create<memref::LoadOp>(loc, wa, idij);
                          Value waidijp1 =
                              b3.create<memref::LoadOp>(loc, wa, idijp1);
                          Value waidij2 =
                              b3.create<memref::LoadOp>(loc, wa, idij2);
                          Value waidij2p1 =
                              b3.create<memref::LoadOp>(loc, wa, idij2p1);

                          // x1 = wa[idij] * t1 + wa[idij + 1] * t2
                          Value tmp1_x1 =
                              b3.create<arith::MulFOp>(loc, waidij, t1);
                          Value tmp2_x1 =
                              b3.create<arith::MulFOp>(loc, waidijp1, t2);
                          Value x1 =
                              b3.create<arith::AddFOp>(loc, tmp1_x1, tmp2_x1);

                          // x2 = wa[idij] * t2 - wa[idij + 1] * t1
                          Value tmp1_x2 =
                              b3.create<arith::MulFOp>(loc, waidij, t2);
                          Value tmp2_x2 =
                              b3.create<arith::MulFOp>(loc, waidijp1, t1);
                          Value x2 =
                              b3.create<arith::SubFOp>(loc, tmp1_x2, tmp2_x2);

                          // x3 = wa[idij2] * t3 + wa[idij2 + 1] * t4
                          Value tmp1_x3 =
                              b3.create<arith::MulFOp>(loc, waidij2, t3);
                          Value tmp2_x3 =
                              b3.create<arith::MulFOp>(loc, waidij2p1, t4);
                          Value x3 =
                              b3.create<arith::AddFOp>(loc, tmp1_x3, tmp2_x3);

                          // x4 = wa[idij2] * t4 - wa[idij2 + 1] * t3
                          Value tmp1_x4 =
                              b3.create<arith::MulFOp>(loc, waidij2, t4);
                          Value tmp2_x4 =
                              b3.create<arith::MulFOp>(loc, waidij2p1, t3);
                          Value x4 =
                              b3.create<arith::SubFOp>(loc, tmp1_x4, tmp2_x4);

                          Value tmp3 = b3.create<arith::AddFOp>(loc, x1, x3);
                          Value tmp4 = b3.create<arith::SubFOp>(loc, x2, x4);
                          Value tmp5 = b3.create<arith::AddFOp>(loc, x2, x4);
                          Value tmp6 = b3.create<arith::SubFOp>(loc, x3, x1);

                          C1w(b3, loc, cc, i, k, j, ido, l1, tmp3);
                          C1w(b3, loc, cc, i, k, jc, ido, l1, tmp4);
                          C1w(b3, loc, cc, ip1, k, j, ido, l1, tmp5);
                          C1w(b3, loc, cc, ip1, k, jc, ido, l1, tmp6);

                          Value idij_next =
                              b3.create<arith::AddIOp>(loc, idij, c2);
                          Value idij2_next =
                              b3.create<arith::AddIOp>(loc, idij2, c2);

                          b3.create<scf::YieldOp>(
                              loc, std::vector<Value>{idij_next, idij2_next});
                        });
                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  }

              );

              Value jc_next = b.create<arith::SubIOp>(loc, jc, c1);
              b.create<scf::YieldOp>(loc, jc_next);
            });

        builder.create<scf::YieldOp>(loc, std::nullopt);
      });

  // Stage 2: C1(0, k, j) <- t1 + t2 and C1(0, k, jc) <- t2 - t1 for every
  // mirrored pair (j, jc).
  Value jc_a_start = opBuilder.create<arith::SubIOp>(loc, ip, c1);

  opBuilder.create<scf::ForOp>(
      loc, c1, ipph, c1, ValueRange{jc_a_start},
      [&](OpBuilder &builder, Location loc, Value j_a, ValueRange j_a_args) {
        Value jc_a = j_a_args[0];

        builder.create<scf::ForOp>(
            loc, c0, l1, c1, std::nullopt,
            [&](OpBuilder &b, Location loc, Value k_a, ValueRange k_a_args) {
              Value t1_a = C1(b, loc, cc, c0, k_a, j_a, ido, l1);
              Value t2_a = C1(b, loc, cc, c0, k_a, jc_a, ido, l1);

              Value tmp_a = b.create<arith::AddFOp>(loc, t1_a, t2_a);
              Value tmp1_a = b.create<arith::SubFOp>(loc, t2_a, t1_a);

              C1w(b, loc, cc, c0, k_a, j_a, ido, l1, tmp_a);
              C1w(b, loc, cc, c0, k_a, jc_a, ido, l1, tmp1_a);
              b.create<scf::YieldOp>(loc, std::nullopt);
            });

        Value jc_a_next = builder.create<arith::SubIOp>(loc, jc_a, c1);
        builder.create<scf::YieldOp>(loc, jc_a_next);
      });

  // Stage 3: accumulate CH2(., l) and CH2(., lc) for every mirrored pair
  // (l, lc = ip - l).
  Value lc_b_start = opBuilder.create<arith::SubIOp>(loc, ip, c1);

  opBuilder.create<scf::ForOp>(
      loc, c1, ipph, c1, ValueRange{lc_b_start},
      [&](OpBuilder &builder, Location loc, Value l_b, ValueRange l_b_args) {
        Value lc_b = l_b_args[0];

        // Initial terms:
        //   CH2(ik, l)  = C2(ik, 0) + csarr[2l] * C2(ik, 1)
        //                           + csarr[4l] * C2(ik, 2)
        //   CH2(ik, lc) = csarr[2l + 1] * C2(ik, ip - 1)
        //               + csarr[4l + 1] * C2(ik, ip - 2)
        builder.create<scf::ForOp>(
            loc, c0, idl1, c1, std::nullopt,
            [&](OpBuilder &b, Location loc, Value ik_b, ValueRange ik_b_args) {
              Value m2l = b.create<arith::MulIOp>(loc, l_b, c2);
              Value m4l = b.create<arith::MulIOp>(loc, l_b, c4);
              Value m2lp1 = b.create<arith::AddIOp>(loc, m2l, c1);
              Value m4lp1 = b.create<arith::AddIOp>(loc, m4l, c1);

              Value csarr2l = CSARR(b, loc, csarr, m2l);
              Value csarr4l = CSARR(b, loc, csarr, m4l);
              Value csarr2lp1 = CSARR(b, loc, csarr, m2lp1);
              Value csarr4lp1 = CSARR(b, loc, csarr, m4lp1);

              Value c2ik0 = C2(b, loc, cc, ik_b, c0, idl1);
              Value c2ik1 = C2(b, loc, cc, ik_b, c1, idl1);
              Value c2ik2 = C2(b, loc, cc, ik_b, c2, idl1);

              Value c2ikipm1 = C2(b, loc, cc, ik_b, ipm1, idl1);
              Value c2ikipm2 = C2(b, loc, cc, ik_b, ipm2, idl1);

              Value tmp_b = b.create<arith::MulFOp>(loc, csarr2l, c2ik1);
              Value tmp1_b = b.create<arith::MulFOp>(loc, csarr4l, c2ik2);
              Value tmp2_b = b.create<arith::AddFOp>(loc, tmp_b, tmp1_b);
              Value tmp3_b = b.create<arith::AddFOp>(loc, c2ik0, tmp2_b);

              CH2w(b, loc, ch, ik_b, l_b, idl1, tmp3_b);

              Value tmp4_b = b.create<arith::MulFOp>(loc, csarr2lp1, c2ikipm1);
              Value tmp5_b = b.create<arith::MulFOp>(loc, csarr4lp1, c2ikipm2);
              Value tmp6_b = b.create<arith::AddFOp>(loc, tmp4_b, tmp5_b);

              CH2w(b, loc, ch, ik_b, lc_b, idl1, tmp6_b);
              b.create<scf::YieldOp>(loc, std::nullopt);
            });

        // The three loops below share the running state (j, jc, iang), which
        // is threaded from one loop to the next through iter_args / results.
        Value iang_start_c = builder.create<arith::MulIOp>(loc, c2, l_b);
        Value j_start_c = builder.create<ConstantIndexOp>(loc, 3);
        Value jc_start_c = builder.create<arith::SubIOp>(loc, ip, c3);
        Value ipphm1 = builder.create<arith::SubIOp>(loc, ipph, c1);
        Value ipphm3 = builder.create<arith::SubIOp>(loc, ipph, c3);

        // 4-way unrolled accumulation: j < ipph - 3, j += 4, jc -= 4.
        auto loop1 = builder.create<scf::ForOp>(
            loc, j_start_c, ipphm3, c4,
            ValueRange{j_start_c, jc_start_c, iang_start_c},
            [&](OpBuilder &b, Location loc, Value j_loop,
                ValueRange j_loop_args) {
              Value j = j_loop_args[0];
              Value jc = j_loop_args[1];
              Value iang = j_loop_args[2];

              Value iang_1_c = IANG(b, loc, iang, l_b, ip);
              Value ar1 = AR(b, loc, csarr, iang_1_c);
              Value ai1 = AI(b, loc, csarr, iang_1_c);

              Value iang_2_c = IANG(b, loc, iang_1_c, l_b, ip);
              Value ar2 = AR(b, loc, csarr, iang_2_c);
              Value ai2 = AI(b, loc, csarr, iang_2_c);

              Value iang_3_c = IANG(b, loc, iang_2_c, l_b, ip);
              Value ar3 = AR(b, loc, csarr, iang_3_c);
              Value ai3 = AI(b, loc, csarr, iang_3_c);

              Value iang_4_c = IANG(b, loc, iang_3_c, l_b, ip);
              Value ar4 = AR(b, loc, csarr, iang_4_c);
              Value ai4 = AI(b, loc, csarr, iang_4_c);

              b.create<scf::ForOp>(
                  loc, c0, idl1, c1, std::nullopt,
                  [&](OpBuilder &b2, Location loc, Value ik_c,
                      ValueRange ik_c_args) {
                    Value jp1 = b2.create<arith::AddIOp>(loc, j, c1);
                    Value jp2 = b2.create<arith::AddIOp>(loc, j, c2);
                    Value jp3 = b2.create<arith::AddIOp>(loc, j, c3);

                    Value c2ikj = C2(b2, loc, cc, ik_c, j, idl1);
                    Value c2ikjp1 = C2(b2, loc, cc, ik_c, jp1, idl1);
                    Value c2ikjp2 = C2(b2, loc, cc, ik_c, jp2, idl1);
                    Value c2ikjp3 = C2(b2, loc, cc, ik_c, jp3, idl1);

                    Value tmp_c = b2.create<arith::MulFOp>(loc, ar1, c2ikj);
                    Value tmp1_c = b2.create<arith::MulFOp>(loc, ar2, c2ikjp1);
                    Value tmp2_c = b2.create<arith::MulFOp>(loc, ar3, c2ikjp2);
                    Value tmp3_c = b2.create<arith::MulFOp>(loc, ar4, c2ikjp3);

                    Value tmp4_c = b2.create<arith::AddFOp>(loc, tmp_c, tmp1_c);
                    Value tmp5_c =
                        b2.create<arith::AddFOp>(loc, tmp4_c, tmp2_c);
                    Value tmp6_c =
                        b2.create<arith::AddFOp>(loc, tmp5_c, tmp3_c);

                    Value ch2ikl = CH2(b2, loc, ch, ik_c, l_b, idl1);
                    Value tmp7_c =
                        b2.create<arith::AddFOp>(loc, tmp6_c, ch2ikl);
                    CH2w(b2, loc, ch, ik_c, l_b, idl1, tmp7_c);

                    Value jcm1 = b2.create<arith::SubIOp>(loc, jc, c1);
                    Value jcm2 = b2.create<arith::SubIOp>(loc, jc, c2);
                    Value jcm3 = b2.create<arith::SubIOp>(loc, jc, c3);

                    Value c2ikjc = C2(b2, loc, cc, ik_c, jc, idl1);
                    Value c2ikjcm1 = C2(b2, loc, cc, ik_c, jcm1, idl1);
                    Value c2ikjcm2 = C2(b2, loc, cc, ik_c, jcm2, idl1);
                    Value c2ikjcm3 = C2(b2, loc, cc, ik_c, jcm3, idl1);

                    Value tmp_ai1 = b2.create<arith::MulFOp>(loc, ai1, c2ikjc);
                    Value tmp_ai2 =
                        b2.create<arith::MulFOp>(loc, ai2, c2ikjcm1);
                    Value tmp_ai3 =
                        b2.create<arith::MulFOp>(loc, ai3, c2ikjcm2);
                    Value tmp_ai4 =
                        b2.create<arith::MulFOp>(loc, ai4, c2ikjcm3);

                    Value tmp_ai5 =
                        b2.create<arith::AddFOp>(loc, tmp_ai1, tmp_ai2);
                    Value tmp_ai6 =
                        b2.create<arith::AddFOp>(loc, tmp_ai5, tmp_ai3);
                    Value tmp_ai7 =
                        b2.create<arith::AddFOp>(loc, tmp_ai6, tmp_ai4);

                    Value ch2iklc = CH2(b2, loc, ch, ik_c, lc_b, idl1);
                    Value tmp_ai8 =
                        b2.create<arith::AddFOp>(loc, tmp_ai7, ch2iklc);
                    CH2w(b2, loc, ch, ik_c, lc_b, idl1, tmp_ai8);

                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  });

              Value j_next = b.create<arith::AddIOp>(loc, j, c4);
              Value jc_next = b.create<arith::SubIOp>(loc, jc, c4);
              // Terminate the body through the builder handed to this
              // callback rather than the enclosing one.
              b.create<scf::YieldOp>(
                  loc, std::vector<Value>{j_next, jc_next, iang_4_c});
            });

        Value j_1_c = loop1.getResults()[0];
        Value jc_1_c = loop1.getResults()[1];
        Value iang1_c = loop1.getResults()[2];

        // 2-way unrolled accumulation: j < ipph - 1, j += 2, jc -= 2.
        auto loop2 = builder.create<scf::ForOp>(
            loc, j_1_c, ipphm1, c2, ValueRange{j_1_c, jc_1_c, iang1_c},
            [&](OpBuilder &b, Location loc, Value j_loop,
                ValueRange j_loop_args) {
              Value j = j_loop_args[0];
              Value jc = j_loop_args[1];
              Value iang = j_loop_args[2];

              Value iang_1_d = IANG(b, loc, iang, l_b, ip);
              Value ar1 = AR(b, loc, csarr, iang_1_d);
              Value ai1 = AI(b, loc, csarr, iang_1_d);

              Value iang_2_d = IANG(b, loc, iang_1_d, l_b, ip);
              Value ar2 = AR(b, loc, csarr, iang_2_d);
              Value ai2 = AI(b, loc, csarr, iang_2_d);

              b.create<scf::ForOp>(
                  loc, c0, idl1, c1, std::nullopt,
                  [&](OpBuilder &b2, Location loc, Value ik_d,
                      ValueRange ik_d_args) {
                    Value jp1 = b2.create<arith::AddIOp>(loc, j, c1);

                    Value c2ikj = C2(b2, loc, cc, ik_d, j, idl1);
                    Value c2ikjp1 = C2(b2, loc, cc, ik_d, jp1, idl1);

                    Value tmp_c = b2.create<arith::MulFOp>(loc, ar1, c2ikj);
                    Value tmp1_c = b2.create<arith::MulFOp>(loc, ar2, c2ikjp1);
                    Value tmp2_c = b2.create<arith::AddFOp>(loc, tmp_c, tmp1_c);

                    Value ch2ikl = CH2(b2, loc, ch, ik_d, l_b, idl1);
                    Value tmp3_c =
                        b2.create<arith::AddFOp>(loc, tmp2_c, ch2ikl);
                    CH2w(b2, loc, ch, ik_d, l_b, idl1, tmp3_c);

                    Value jcm1 = b2.create<arith::SubIOp>(loc, jc, c1);
                    Value c2ikjc = C2(b2, loc, cc, ik_d, jc, idl1);
                    Value c2ikjcm1 = C2(b2, loc, cc, ik_d, jcm1, idl1);

                    Value tmp_ai1 = b2.create<arith::MulFOp>(loc, ai1, c2ikjc);
                    Value tmp_ai2 =
                        b2.create<arith::MulFOp>(loc, ai2, c2ikjcm1);
                    Value tmp_ai3 =
                        b2.create<arith::AddFOp>(loc, tmp_ai1, tmp_ai2);

                    Value ch2iklc = CH2(b2, loc, ch, ik_d, lc_b, idl1);
                    Value tmp_ai4 =
                        b2.create<arith::AddFOp>(loc, tmp_ai3, ch2iklc);
                    CH2w(b2, loc, ch, ik_d, lc_b, idl1, tmp_ai4);

                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  });

              Value j_next = b.create<arith::AddIOp>(loc, j, c2);
              Value jc_next = b.create<arith::SubIOp>(loc, jc, c2);
              b.create<scf::YieldOp>(
                  loc, std::vector<Value>{j_next, jc_next, iang_2_d});
            });

        Value j_2_c = loop2.getResults()[0];
        Value jc_2_c = loop2.getResults()[1];
        Value iang2_c = loop2.getResults()[2];

        // Remainder: one element per iteration, j < ipph, ++j, --jc.
        builder.create<scf::ForOp>(
            loc, j_2_c, ipph, c1, ValueRange{j_2_c, jc_2_c, iang2_c},
            [&](OpBuilder &b, Location loc, Value j_loop,
                ValueRange j_loop_args) {
              Value j = j_loop_args[0];
              Value jc = j_loop_args[1];
              Value iang = j_loop_args[2];

              Value iang_1_e = IANG(b, loc, iang, l_b, ip);
              Value ar = AR(b, loc, csarr, iang_1_e);
              Value ai = AI(b, loc, csarr, iang_1_e);

              b.create<scf::ForOp>(
                  loc, c0, idl1, c1, std::nullopt,
                  [&](OpBuilder &b2, Location loc, Value ik_e,
                      ValueRange ik_e_args) {
                    Value c2ikj = C2(b2, loc, cc, ik_e, j, idl1);
                    Value tmp_c = b2.create<arith::MulFOp>(loc, ar, c2ikj);
                    Value ch2ikl = CH2(b2, loc, ch, ik_e, l_b, idl1);
                    Value tmp2_c = b2.create<arith::AddFOp>(loc, tmp_c, ch2ikl);
                    CH2w(b2, loc, ch, ik_e, l_b, idl1, tmp2_c);

                    Value c2ikjc = C2(b2, loc, cc, ik_e, jc, idl1);
                    Value tmp_ai = b2.create<arith::MulFOp>(loc, ai, c2ikjc);
                    Value ch2iklc = CH2(b2, loc, ch, ik_e, lc_b, idl1);
                    Value tmp2_ai =
                        b2.create<arith::AddFOp>(loc, tmp_ai, ch2iklc);
                    CH2w(b2, loc, ch, ik_e, lc_b, idl1, tmp2_ai);

                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  });

              // Advance the carried indices by the loop step (1) so that they
              // stay in sync with the induction variable.
              Value j_next = b.create<arith::AddIOp>(loc, j, c1);
              Value jc_next = b.create<arith::SubIOp>(loc, jc, c1);
              b.create<scf::YieldOp>(
                  loc, std::vector<Value>{j_next, jc_next, iang_1_e});
            });

        Value lc_b_next = builder.create<arith::SubIOp>(loc, lc_b, c1);
        builder.create<scf::YieldOp>(loc, lc_b_next);
      });

  radfgExtend(opBuilder, loc, cc, ch, wa, csarr, ido, ip, l1);
}

// Emits the doubly nested loop of the radix-2 pass: for every k in [0, l1)
// and every even i in [2, ido) the CC entries of the second input column are
// combined with the `wa` entries via MULPM, merged with the first column via
// PM, and the four results are written through CH (with cdim as its
// dimension argument). CC, CH, WA, PM and MULPM are helpers defined
// elsewhere in this file.
void radf2Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch,
                 Value wa, Value ido, Value l1, Value cdim) {
  Value zero = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value one = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value two = opBuilder.create<ConstantIndexOp>(loc, 2);

  opBuilder.create<scf::ForOp>(
      loc, zero, l1, one, std::nullopt,
      [&](OpBuilder &kBuilder, Location kLoc, Value k, ValueRange) {
        // Body of the inner loop over i = 2, 4, ..., < ido.
        auto emitInner = [&](OpBuilder &ib, Location iLoc, Value i,
                             ValueRange) {
          // Mirrored index ic = ido - i plus the neighbours of i and ic.
          Value mirror = ib.create<arith::SubIOp>(iLoc, ido, i);
          Value mirrorPrev = ib.create<arith::SubIOp>(iLoc, mirror, one);
          Value iPrev = ib.create<arith::SubIOp>(iLoc, i, one);
          Value iPrev2 = ib.create<arith::SubIOp>(iLoc, i, two);

          Value twA = WA(ib, iLoc, wa, zero, iPrev2, ido, one);
          Value twB = WA(ib, iLoc, wa, zero, iPrev, ido, one);
          Value inPrevCol1 = CC(ib, iLoc, cc, iPrev, k, one, ido, l1);
          Value inCurCol1 = CC(ib, iLoc, cc, i, k, one, ido, l1);
          std::vector<Value> rotated =
              MULPM(ib, iLoc, twA, twB, inPrevCol1, inCurCol1);

          Value inPrevCol0 = CC(ib, iLoc, cc, iPrev, k, zero, ido, l1);
          Value inCurCol0 = CC(ib, iLoc, cc, i, k, zero, ido, l1);
          std::vector<Value> pairPrev = PM(ib, iLoc, inPrevCol0, rotated[0]);
          std::vector<Value> pairCur = PM(ib, iLoc, rotated[1], inCurCol0);

          CH(ib, iLoc, ch, iPrev, zero, k, ido, cdim, pairPrev[0]);
          CH(ib, iLoc, ch, mirrorPrev, one, k, ido, cdim, pairPrev[1]);

          CH(ib, iLoc, ch, i, zero, k, ido, cdim, pairCur[0]);
          CH(ib, iLoc, ch, mirror, one, k, ido, cdim, pairCur[1]);
          ib.create<scf::YieldOp>(iLoc, std::nullopt);
        };

        kBuilder.create<scf::ForOp>(kLoc, two, ido, two, std::nullopt,
                                    emitInner);
        kBuilder.create<scf::YieldOp>(kLoc, std::nullopt);
      });
}

// Handle radix-2 FFT computation
//
// Emits three pieces of IR:
//   1. a loop over k in [0, l1) that combines CC(0, k, 0) and CC(0, k, 1)
//      with PM and writes the two results via CH;
//   2. when ido is even, a loop that copies / negates the CC(ido - 1, k, *)
//      entries into CH;
//   3. when ido > 2, the general inner loops produced by radf2Extend().
void radf2(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa,
           Value ido, Value l1) {
  Value cdim = opBuilder.create<ConstantIndexOp>(loc, 2);

  Value zero = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value one = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value two = opBuilder.create<ConstantIndexOp>(loc, 2);

  Value lastIdx = opBuilder.create<arith::SubIOp>(loc, ido, one);

  // Part 1: first-row butterflies.
  auto emitFirstRow = [&](OpBuilder &kb, Location kLoc, Value k, ValueRange) {
    Value in0 = CC(kb, kLoc, cc, zero, k, zero, ido, l1);
    Value in1 = CC(kb, kLoc, cc, zero, k, one, ido, l1);
    std::vector<Value> sumDiff = PM(kb, kLoc, in0, in1);
    CH(kb, kLoc, ch, zero, zero, k, ido, cdim, sumDiff[0]);
    CH(kb, kLoc, ch, lastIdx, one, k, ido, cdim, sumDiff[1]);
    kb.create<scf::YieldOp>(kLoc, std::nullopt);
  };
  opBuilder.create<scf::ForOp>(loc, zero, l1, one, std::nullopt, emitFirstRow);

  // Part 2: only taken when ido % 2 == 0.
  Value idoMod2 = opBuilder.create<arith::RemSIOp>(loc, ido, two);
  Value idoIsEven = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, idoMod2, zero);

  auto emitEvenCase = [&](OpBuilder &thenBuilder, Location thenLoc) {
    thenBuilder.create<scf::ForOp>(
        thenLoc, zero, l1, one, std::nullopt,
        [&](OpBuilder &kb, Location kLoc, Value k, ValueRange) {
          Value lastCol1 = CC(kb, kLoc, cc, lastIdx, k, one, ido, l1);
          Value negLastCol1 = kb.create<arith::NegFOp>(kLoc, lastCol1);
          CH(kb, kLoc, ch, zero, one, k, ido, cdim, negLastCol1);
          Value lastCol0 = CC(kb, kLoc, cc, lastIdx, k, zero, ido, l1);
          CH(kb, kLoc, ch, lastIdx, zero, k, ido, cdim, lastCol0);
          kb.create<scf::YieldOp>(kLoc, std::nullopt);
        });
    thenBuilder.create<scf::YieldOp>(thenLoc, std::nullopt);
  };
  opBuilder.create<scf::IfOp>(loc, idoIsEven, emitEvenCase);

  // Part 3: general case, only taken when ido > 2.
  Value idoAbove2 = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::sgt, ido, two);
  auto emitGeneral = [&](OpBuilder &thenBuilder, Location thenLoc) {
    radf2Extend(thenBuilder, thenLoc, cc, ch, wa, ido, l1, cdim);
    thenBuilder.create<scf::YieldOp>(thenLoc, std::nullopt);
  };
  opBuilder.create<scf::IfOp>(loc, idoAbove2, emitGeneral);
}

// Emits the doubly nested loop of the radix-3 pass: for every k in [0, l1)
// and every even i in [2, ido) the second and third CC columns are combined
// with the `wa` entries via MULPM, mixed with the first column using the
// constants taur = -0.5 and taui = 0.866..., and written through CH (with
// cdim as its dimension argument). CC, CH, WA, PM and MULPM are helpers
// defined elsewhere in this file.
//
// All operations of the innermost body are created through the builder that
// scf::ForOp hands to that body (`b`), consistent with radf2Extend and
// radf4Extend, instead of relying on the enclosing loop's builder happening
// to share the same insertion point.
void radf3Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch,
                 Value wa, Value ido, Value l1, Value cdim) {

  FloatType f64Ty = opBuilder.getF64Type();
  Value taur =
      opBuilder.create<ConstantFloatOp>(loc, APFloat(double(-0.5)), f64Ty);
  Value taui = opBuilder.create<ConstantFloatOp>(
      loc, APFloat(double(0.86602540378443864676)), f64Ty);

  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);

  opBuilder.create<scf::ForOp>(
      loc, c0, l1, c1, std::nullopt,
      [&](OpBuilder &builder, Location loc, Value k, ValueRange k_args) {
        builder.create<scf::ForOp>(
            loc, c2, ido, c2, std::nullopt,
            [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) {
              // Mirrored index ic = ido - i plus the neighbours of i and ic.
              Value ic = b.create<arith::SubIOp>(loc, ido, i);
              Value icm1 = b.create<arith::SubIOp>(loc, ic, c1);
              Value im1 = b.create<arith::SubIOp>(loc, i, c1);
              Value im2 = b.create<arith::SubIOp>(loc, i, c2);

              // (dr2, di2) from column 1 and the first `wa` row.
              Value wa0im2 = WA(b, loc, wa, c0, im2, ido, c1);
              Value wa0im1 = WA(b, loc, wa, c0, im1, ido, c1);
              Value ccim1k1 = CC(b, loc, cc, im1, k, c1, ido, l1);
              Value ccik1 = CC(b, loc, cc, i, k, c1, ido, l1);
              std::vector<Value> dr2_di2 =
                  MULPM(b, loc, wa0im2, wa0im1, ccim1k1, ccik1);

              // (dr3, di3) from column 2 and the second `wa` row.
              Value wa1im2 = WA(b, loc, wa, c1, im2, ido, c1);
              Value wa1im1 = WA(b, loc, wa, c1, im1, ido, c1);
              Value ccim1k2 = CC(b, loc, cc, im1, k, c2, ido, l1);
              Value ccik2 = CC(b, loc, cc, i, k, c2, ido, l1);
              std::vector<Value> dr3_di3 =
                  MULPM(b, loc, wa1im2, wa1im1, ccim1k2, ccik2);

              Value cr2 = b.create<arith::AddFOp>(loc, dr2_di2[0], dr3_di3[0]);
              Value ci2 = b.create<arith::AddFOp>(loc, dr2_di2[1], dr3_di3[1]);

              Value ccim1k0 = CC(b, loc, cc, im1, k, c0, ido, l1);
              Value tmp5 = b.create<arith::AddFOp>(loc, ccim1k0, cr2);
              CH(b, loc, ch, im1, c0, k, ido, cdim, tmp5);

              Value ccik0 = CC(b, loc, cc, i, k, c0, ido, l1);
              Value tmp6 = b.create<arith::AddFOp>(loc, ccik0, ci2);
              CH(b, loc, ch, i, c0, k, ido, cdim, tmp6);

              // tr2 = CC(i-1,k,0) + taur * cr2, ti2 = CC(i,k,0) + taur * ci2
              Value tmp7 = b.create<arith::MulFOp>(loc, taur, cr2);
              Value tr2 = b.create<arith::AddFOp>(loc, ccim1k0, tmp7);

              Value tmp8 = b.create<arith::MulFOp>(loc, taur, ci2);
              Value ti2 = b.create<arith::AddFOp>(loc, ccik0, tmp8);

              // tr3 = taui * (di2 - di3), ti3 = taui * (dr3 - dr2)
              Value tmp9 = b.create<arith::SubFOp>(loc, dr2_di2[1], dr3_di3[1]);
              Value tr3 = b.create<arith::MulFOp>(loc, taui, tmp9);

              Value tmp10 =
                  b.create<arith::SubFOp>(loc, dr3_di3[0], dr2_di2[0]);
              Value ti3 = b.create<arith::MulFOp>(loc, taui, tmp10);

              std::vector<Value> tr2_tr3 = PM(b, loc, tr2, tr3);
              std::vector<Value> ti3_ti2 = PM(b, loc, ti3, ti2);

              CH(b, loc, ch, im1, c2, k, ido, cdim, tr2_tr3[0]);
              CH(b, loc, ch, icm1, c1, k, ido, cdim, tr2_tr3[1]);

              CH(b, loc, ch, i, c2, k, ido, cdim, ti3_ti2[0]);
              CH(b, loc, ch, ic, c1, k, ido, cdim, ti3_ti2[1]);

              b.create<scf::YieldOp>(loc, std::nullopt);
            });
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });
}

// Handle radix-3 FFT computation
//
// Emits a loop over k in [0, l1) that combines the three CC(0, k, *) entries
// (using taur = -0.5 and taui = 0.866...) and writes the results via CH, and
// then, guarded by ido != 1, the general inner loops from radf3Extend().
void radf3(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa,
           Value ido, Value l1) {
  FloatType f64Ty = opBuilder.getF64Type();
  Value cdim = opBuilder.create<ConstantIndexOp>(loc, 3);
  Value taur = opBuilder.create<ConstantFloatOp>(loc, APFloat(-0.5), f64Ty);
  Value taui = opBuilder.create<ConstantFloatOp>(
      loc, APFloat(0.86602540378443864676), f64Ty);

  Value zero = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value one = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value two = opBuilder.create<ConstantIndexOp>(loc, 2);

  Value lastIdx = opBuilder.create<arith::SubIOp>(loc, ido, one);

  auto emitFirstRow = [&](OpBuilder &kb, Location kLoc, Value k, ValueRange) {
    Value in1 = CC(kb, kLoc, cc, zero, k, one, ido, l1);
    Value in2 = CC(kb, kLoc, cc, zero, k, two, ido, l1);
    Value cr2 = kb.create<arith::AddFOp>(kLoc, in1, in2);

    // CH(0, 0, k) = CC(0, k, 0) + cr2
    Value in0 = CC(kb, kLoc, cc, zero, k, zero, ido, l1);
    Value total = kb.create<arith::AddFOp>(kLoc, in0, cr2);
    CH(kb, kLoc, ch, zero, zero, k, ido, cdim, total);

    // CH(0, 2, k) = (CC(0, k, 2) - CC(0, k, 1)) * taui
    Value diff = kb.create<arith::SubFOp>(kLoc, in2, in1);
    Value scaledDiff = kb.create<arith::MulFOp>(kLoc, diff, taui);
    CH(kb, kLoc, ch, zero, two, k, ido, cdim, scaledDiff);

    // CH(ido - 1, 1, k) = taur * cr2 + CC(0, k, 0)
    Value scaledSum = kb.create<arith::MulFOp>(kLoc, taur, cr2);
    Value shifted = kb.create<arith::AddFOp>(kLoc, scaledSum, in0);
    CH(kb, kLoc, ch, lastIdx, one, k, ido, cdim, shifted);

    kb.create<scf::YieldOp>(kLoc, std::nullopt);
  };
  opBuilder.create<scf::ForOp>(loc, zero, l1, one, std::nullopt, emitFirstRow);

  // General case, only taken when ido != 1.
  Value idoNotOne = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::ne, ido, one);
  auto emitGeneral = [&](OpBuilder &thenBuilder, Location thenLoc) {
    radf3Extend(thenBuilder, thenLoc, cc, ch, wa, ido, l1, cdim);
    thenBuilder.create<scf::YieldOp>(thenLoc, std::nullopt);
  };
  opBuilder.create<scf::IfOp>(loc, idoNotOne, emitGeneral);
}

// Emits the doubly nested loop of the radix-4 pass: for every k in [0, l1)
// and every even i in [2, ido) columns 1..3 of CC are combined with the three
// `wa` rows via MULPM, merged pairwise with PM, and the eight results are
// written through CH (with cdim as its dimension argument). The index
// constants c0..c3 are supplied by the caller. CC, CH, WA, PM and MULPM are
// helpers defined elsewhere in this file.
void radf4Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch,
                 Value wa, Value ido, Value l1, Value cdim, Value c0, Value c1,
                 Value c2, Value c3) {
  opBuilder.create<scf::ForOp>(
      loc, c0, l1, c1, std::nullopt,
      [&](OpBuilder &kBuilder, Location kLoc, Value k, ValueRange) {
        // Body of the inner loop over i = 2, 4, ..., < ido.
        auto emitInner = [&](OpBuilder &ib, Location iLoc, Value i,
                             ValueRange) {
          // Mirrored index ic = ido - i plus the neighbours of i and ic.
          Value mirror = ib.create<arith::SubIOp>(iLoc, ido, i);
          Value mirrorPrev = ib.create<arith::SubIOp>(iLoc, mirror, c1);
          Value iPrev = ib.create<arith::SubIOp>(iLoc, i, c1);
          Value iPrev2 = ib.create<arith::SubIOp>(iLoc, i, c2);

          // (cr2, ci2): column 1 with `wa` row 0.
          Value tw0a = WA(ib, iLoc, wa, c0, iPrev2, ido, c1);
          Value tw0b = WA(ib, iLoc, wa, c0, iPrev, ido, c1);
          Value col1Prev = CC(ib, iLoc, cc, iPrev, k, c1, ido, l1);
          Value col1Cur = CC(ib, iLoc, cc, i, k, c1, ido, l1);
          std::vector<Value> c2pair =
              MULPM(ib, iLoc, tw0a, tw0b, col1Prev, col1Cur);

          // (cr3, ci3): column 2 with `wa` row 1.
          Value tw1a = WA(ib, iLoc, wa, c1, iPrev2, ido, c1);
          Value tw1b = WA(ib, iLoc, wa, c1, iPrev, ido, c1);
          Value col2Prev = CC(ib, iLoc, cc, iPrev, k, c2, ido, l1);
          Value col2Cur = CC(ib, iLoc, cc, i, k, c2, ido, l1);
          std::vector<Value> c3pair =
              MULPM(ib, iLoc, tw1a, tw1b, col2Prev, col2Cur);

          // (cr4, ci4): column 3 with `wa` row 2.
          Value tw2a = WA(ib, iLoc, wa, c2, iPrev2, ido, c1);
          Value tw2b = WA(ib, iLoc, wa, c2, iPrev, ido, c1);
          Value col3Prev = CC(ib, iLoc, cc, iPrev, k, c3, ido, l1);
          Value col3Cur = CC(ib, iLoc, cc, i, k, c3, ido, l1);
          std::vector<Value> c4pair =
              MULPM(ib, iLoc, tw2a, tw2b, col3Prev, col3Cur);

          // Intermediate pairs (tr1, tr4), (ti1, ti4), (tr2, tr3), (ti2, ti3).
          std::vector<Value> tr14 = PM(ib, iLoc, c4pair[0], c2pair[0]);
          std::vector<Value> ti14 = PM(ib, iLoc, c2pair[1], c4pair[1]);
          Value col0Prev = CC(ib, iLoc, cc, iPrev, k, c0, ido, l1);
          std::vector<Value> tr23 = PM(ib, iLoc, col0Prev, c3pair[0]);
          Value col0Cur = CC(ib, iLoc, cc, i, k, c0, ido, l1);
          std::vector<Value> ti23 = PM(ib, iLoc, col0Cur, c3pair[1]);

          std::vector<Value> out0 = PM(ib, iLoc, tr23[0], tr14[0]);
          CH(ib, iLoc, ch, iPrev, c0, k, ido, cdim, out0[0]);
          CH(ib, iLoc, ch, mirrorPrev, c3, k, ido, cdim, out0[1]);

          std::vector<Value> out1 = PM(ib, iLoc, ti14[0], ti23[0]);
          CH(ib, iLoc, ch, i, c0, k, ido, cdim, out1[0]);
          CH(ib, iLoc, ch, mirror, c3, k, ido, cdim, out1[1]);

          std::vector<Value> out2 = PM(ib, iLoc, tr23[1], ti14[1]);
          CH(ib, iLoc, ch, iPrev, c2, k, ido, cdim, out2[0]);
          CH(ib, iLoc, ch, mirrorPrev, c1, k, ido, cdim, out2[1]);

          std::vector<Value> out3 = PM(ib, iLoc, tr14[1], ti23[1]);
          CH(ib, iLoc, ch, i, c2, k, ido, cdim, out3[0]);
          CH(ib, iLoc, ch, mirror, c1, k, ido, cdim, out3[1]);

          ib.create<scf::YieldOp>(iLoc, std::nullopt);
        };

        kBuilder.create<scf::ForOp>(kLoc, c2, ido, c2, std::nullopt,
                                    emitInner);
        kBuilder.create<scf::YieldOp>(kLoc, std::nullopt);
      });
}

// Emit the radix-4 butterfly stage of the real forward FFT.
//
// Three pieces of IR are generated, in this order:
//   1. a loop over k in [0, l1) that handles the i == 0 column,
//   2. when `ido` is even, a loop over k in [0, l1) that handles the
//      i == ido - 1 column,
//   3. when `ido` > 2, the remaining columns via radf4Extend.
// Input is read through the CC accessor and output is written through the CH
// accessor; `wa` is only forwarded to radf4Extend.
// NOTE(review): c0..c3 are presumably the index constants 0..3 created by the
// caller -- confirm at the call site.
void radf4(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa,
           Value ido, Value l1, Value c0, Value c1, Value c2, Value c3) {
  FloatType f64Ty = opBuilder.getF64Type();
  Value cdim = opBuilder.create<ConstantIndexOp>(loc, 4);
  Value hsqt2 = opBuilder.create<ConstantFloatOp>(
      loc, APFloat(double(0.70710678118654752440)), f64Ty);
  Value lastCol = opBuilder.create<arith::SubIOp>(loc, ido, c1);

  // Step 1: the i == 0 column.
  auto firstColumnBody = [&](OpBuilder &nested, Location loc, Value k,
                             ValueRange) {
    Value in3 = CC(nested, loc, cc, c0, k, c3, ido, l1);
    Value in1 = CC(nested, loc, cc, c0, k, c1, ido, l1);
    std::vector<Value> pm31 = PM(nested, loc, in3, in1);
    CH(nested, loc, ch, c0, c2, k, ido, cdim, pm31[1]);

    Value in0 = CC(nested, loc, cc, c0, k, c0, ido, l1);
    Value in2 = CC(nested, loc, cc, c0, k, c2, ido, l1);
    std::vector<Value> pm02 = PM(nested, loc, in0, in2);
    CH(nested, loc, ch, lastCol, c1, k, ido, cdim, pm02[1]);

    std::vector<Value> pmOut = PM(nested, loc, pm02[0], pm31[0]);
    CH(nested, loc, ch, c0, c0, k, ido, cdim, pmOut[0]);
    CH(nested, loc, ch, lastCol, c3, k, ido, cdim, pmOut[1]);

    nested.create<scf::YieldOp>(loc, std::nullopt);
  };
  opBuilder.create<scf::ForOp>(loc, c0, l1, c1, std::nullopt, firstColumnBody);

  // Step 2: the i == ido - 1 column, only emitted under `ido % c2 == 0`.
  Value idoRem = opBuilder.create<arith::RemSIOp>(loc, ido, c2);
  Value idoIsEven = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, idoRem, c0);
  auto evenBranch = [&](OpBuilder &thenBuilder, Location loc) {
    Value negHsqt2 = thenBuilder.create<ConstantFloatOp>(
        loc, APFloat(double(-0.70710678118654752440)), f64Ty);

    auto lastColumnBody = [&](OpBuilder &nested, Location loc, Value k,
                              ValueRange) {
      Value in1 = CC(nested, loc, cc, lastCol, k, c1, ido, l1);
      Value in3 = CC(nested, loc, cc, lastCol, k, c3, ido, l1);
      Value sum13 = nested.create<arith::AddFOp>(loc, in1, in3);
      Value ti1 = nested.create<arith::MulFOp>(loc, negHsqt2, sum13);

      Value diff13 = nested.create<arith::SubFOp>(loc, in1, in3);
      Value tr1 = nested.create<arith::MulFOp>(loc, hsqt2, diff13);

      Value in0 = CC(nested, loc, cc, lastCol, k, c0, ido, l1);
      std::vector<Value> pmReal = PM(nested, loc, in0, tr1);
      CH(nested, loc, ch, lastCol, c0, k, ido, cdim, pmReal[0]);
      CH(nested, loc, ch, lastCol, c2, k, ido, cdim, pmReal[1]);

      Value in2 = CC(nested, loc, cc, lastCol, k, c2, ido, l1);
      std::vector<Value> pmImag = PM(nested, loc, ti1, in2);
      CH(nested, loc, ch, c0, c3, k, ido, cdim, pmImag[0]);
      CH(nested, loc, ch, c0, c1, k, ido, cdim, pmImag[1]);

      nested.create<scf::YieldOp>(loc, std::nullopt);
    };
    thenBuilder.create<scf::ForOp>(loc, c0, l1, c1, std::nullopt,
                                   lastColumnBody);

    thenBuilder.create<scf::YieldOp>(loc, std::nullopt);
  };
  opBuilder.create<scf::IfOp>(loc, idoIsEven, evenBranch);

  // Step 3: the remaining columns, only emitted under `ido > c2`.
  Value hasInnerColumns =
      opBuilder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, ido, c2);
  auto innerBranch = [&](OpBuilder &thenBuilder, Location loc) {
    radf4Extend(thenBuilder, loc, cc, ch, wa, ido, l1, cdim, c0, c1, c2, c3);
    thenBuilder.create<scf::YieldOp>(loc, std::nullopt);
  };
  opBuilder.create<scf::IfOp>(loc, hasInnerColumns, innerBranch);
}

// Emit the general-column part of the radix-5 real forward FFT butterfly.
//
// Generates a loop nest: k runs over [c0, l1) with step c1, and for each k the
// index i runs over [c2, ido) with step c2. Each iteration reads the five
// inputs at columns i-1 and i through the CC accessor, combines them with
// twiddle values fetched through the WA accessor, and writes ten results
// through the CH accessor at columns i-1 / i and ido-i-1 / ido-i.
//
// `cdim` is forwarded to CH; `tr11`, `tr12`, `ti11`, `ti12` are the f64
// butterfly constants supplied by the caller (radf5).
// NOTE(review): CC, CH, WA, PM and MULPM are helpers defined elsewhere in this
// file; PM is used here as returning a two-element vector. c0..c4 are
// presumably the index constants 0..4 -- confirm at the call site.
void radf5Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch,
                 Value wa, Value ido, Value l1, Value cdim, Value tr11,
                 Value tr12, Value ti11, Value ti12, Value c0, Value c1,
                 Value c2, Value c3, Value c4) {
  opBuilder.create<scf::ForOp>(
      loc, c0, l1, c1, std::nullopt,
      [&](OpBuilder &builder, Location loc, Value k, ValueRange kargs) {
        builder.create<scf::ForOp>(
            loc, c2, ido, c2, std::nullopt,
            [&](OpBuilder &b, Location loc, Value i, ValueRange iargs) {
              // Derived column indices: ic = ido - i, plus i-1, i-2, ic-1.
              Value ic = b.create<arith::SubIOp>(loc, ido, i);
              Value icm1 = b.create<arith::SubIOp>(loc, ic, c1);
              Value im1 = b.create<arith::SubIOp>(loc, i, c1);
              Value im2 = b.create<arith::SubIOp>(loc, i, c2);

              // Inputs 1..4 at columns (i-1, i), each combined with twiddle
              // row 0..3 at positions (i-2, i-1) via MULPM.
              Value wa0im2 = WA(b, loc, wa, c0, im2, ido, c1);
              Value wa0im1 = WA(b, loc, wa, c0, im1, ido, c1);
              Value ccim1k1 = CC(b, loc, cc, im1, k, c1, ido, l1);
              Value ccik1 = CC(b, loc, cc, i, k, c1, ido, l1);
              std::vector<Value> dr2_di2 =
                  MULPM(b, loc, wa0im2, wa0im1, ccim1k1, ccik1);

              Value wa1im2 = WA(b, loc, wa, c1, im2, ido, c1);
              Value wa1im1 = WA(b, loc, wa, c1, im1, ido, c1);
              Value ccim1k2 = CC(b, loc, cc, im1, k, c2, ido, l1);
              Value ccik2 = CC(b, loc, cc, i, k, c2, ido, l1);
              std::vector<Value> dr3_di3 =
                  MULPM(b, loc, wa1im2, wa1im1, ccim1k2, ccik2);

              Value wa2im2 = WA(b, loc, wa, c2, im2, ido, c1);
              Value wa2im1 = WA(b, loc, wa, c2, im1, ido, c1);
              Value ccim1k3 = CC(b, loc, cc, im1, k, c3, ido, l1);
              Value ccik3 = CC(b, loc, cc, i, k, c3, ido, l1);
              std::vector<Value> dr4_di4 =
                  MULPM(b, loc, wa2im2, wa2im1, ccim1k3, ccik3);

              Value wa3im2 = WA(b, loc, wa, c3, im2, ido, c1);
              Value wa3im1 = WA(b, loc, wa, c3, im1, ido, c1);
              Value ccim1k4 = CC(b, loc, cc, im1, k, c4, ido, l1);
              Value ccik4 = CC(b, loc, cc, i, k, c4, ido, l1);
              std::vector<Value> dr5_di5 =
                  MULPM(b, loc, wa3im2, wa3im1, ccim1k4, ccik4);

              // Pairwise PM combinations of the twiddled inputs; each vector
              // name lists what element [0] and element [1] are used as.
              std::vector<Value> cr2_ci5 = PM(b, loc, dr5_di5[0], dr2_di2[0]);
              std::vector<Value> ci2_cr5 = PM(b, loc, dr2_di2[1], dr5_di5[1]);
              std::vector<Value> cr3_ci4 = PM(b, loc, dr4_di4[0], dr3_di3[0]);
              std::vector<Value> ci3_cr4 = PM(b, loc, dr3_di3[1], dr4_di4[1]);

              // CH(i-1, 0, k) = CC(i-1, k, 0) + cr2 + cr3
              Value ccim1k0 = CC(b, loc, cc, im1, k, c0, ido, l1);
              Value tmpch0 = b.create<arith::AddFOp>(loc, ccim1k0, cr2_ci5[0]);
              Value chim10k = b.create<arith::AddFOp>(loc, tmpch0, cr3_ci4[0]);
              CH(b, loc, ch, im1, c0, k, ido, cdim, chim10k);

              // CH(i, 0, k) = CC(i, k, 0) + ci2 + ci3
              Value ccik0 = CC(b, loc, cc, i, k, c0, ido, l1);
              Value tmpch1 = b.create<arith::AddFOp>(loc, ccik0, ci2_cr5[0]);
              Value chi0k = b.create<arith::AddFOp>(loc, tmpch1, ci3_cr4[0]);
              CH(b, loc, ch, i, c0, k, ido, cdim, chi0k);

              // tr2 = CC(i-1, k, 0) + tr11 * cr2 + tr12 * cr3
              Value tmp0 = b.create<arith::MulFOp>(loc, tr11, cr2_ci5[0]);
              Value tmp1 = b.create<arith::AddFOp>(loc, ccim1k0, tmp0);
              Value tmp2 = b.create<arith::MulFOp>(loc, tr12, cr3_ci4[0]);
              Value tr2 = b.create<arith::AddFOp>(loc, tmp1, tmp2);

              // ti2 = CC(i, k, 0) + tr11 * ci2 + tr12 * ci3
              Value tmp3 = b.create<arith::MulFOp>(loc, tr11, ci2_cr5[0]);
              Value tmp4 = b.create<arith::AddFOp>(loc, ccik0, tmp3);
              Value tmp5 = b.create<arith::MulFOp>(loc, tr12, ci3_cr4[0]);
              Value ti2 = b.create<arith::AddFOp>(loc, tmp4, tmp5);

              // tr3 = CC(i-1, k, 0) + tr12 * cr2 + tr11 * cr3
              Value tmp6 = b.create<arith::MulFOp>(loc, tr12, cr2_ci5[0]);
              Value tmp7 = b.create<arith::AddFOp>(loc, ccim1k0, tmp6);
              Value tmp8 = b.create<arith::MulFOp>(loc, tr11, cr3_ci4[0]);
              Value tr3 = b.create<arith::AddFOp>(loc, tmp7, tmp8);

              // ti3 = CC(i, k, 0) + tr12 * ci2 + tr11 * ci3
              Value tmp9 = b.create<arith::MulFOp>(loc, tr12, ci2_cr5[0]);
              Value tmp10 = b.create<arith::AddFOp>(loc, ccik0, tmp9);
              Value tmp11 = b.create<arith::MulFOp>(loc, tr11, ci3_cr4[0]);
              Value ti3 = b.create<arith::AddFOp>(loc, tmp10, tmp11);

              // Combine the (cr5, cr4) and (ci5, ci4) pairs with ti11/ti12.
              std::vector<Value> tr5_tr4 =
                  MULPM(b, loc, ci2_cr5[1], ci3_cr4[1], ti11, ti12);
              std::vector<Value> ti5_ti4 =
                  MULPM(b, loc, cr2_ci5[1], cr3_ci4[1], ti11, ti12);

              // Final PM pairs: element [0] goes to column i-1 / i, element
              // [1] goes to the mirrored column ic-1 / ic.
              std::vector<Value> chtmp0 = PM(b, loc, tr2, tr5_tr4[0]);
              CH(b, loc, ch, im1, c2, k, ido, cdim, chtmp0[0]);
              CH(b, loc, ch, icm1, c1, k, ido, cdim, chtmp0[1]);

              std::vector<Value> chtmp1 = PM(b, loc, ti5_ti4[0], ti2);
              CH(b, loc, ch, i, c2, k, ido, cdim, chtmp1[0]);
              CH(b, loc, ch, ic, c1, k, ido, cdim, chtmp1[1]);

              std::vector<Value> chtmp2 = PM(b, loc, tr3, tr5_tr4[1]);
              CH(b, loc, ch, im1, c4, k, ido, cdim, chtmp2[0]);
              CH(b, loc, ch, icm1, c3, k, ido, cdim, chtmp2[1]);

              std::vector<Value> chtmp3 = PM(b, loc, ti5_ti4[1], ti3);
              CH(b, loc, ch, i, c4, k, ido, cdim, chtmp3[0]);
              CH(b, loc, ch, ic, c3, k, ido, cdim, chtmp3[1]);

              b.create<scf::YieldOp>(loc, std::nullopt);
            });

        builder.create<scf::YieldOp>(loc, std::nullopt);
      });

  return;
}

// Emit the radix-5 butterfly stage of the real forward FFT.
//
// First a loop over k in [0, l1) handles the i == 0 column: it reads five
// inputs through CC and writes five outputs through CH (rows 0, 2 and 4 at
// column 0, rows 1 and 3 at column ido - 1). Then, when `ido` differs from
// `c1`, the remaining columns are emitted by radf5Extend.
//
// The four f64 constants are approximately cos(2*pi/5), cos(4*pi/5),
// sin(2*pi/5) and sin(4*pi/5).
// NOTE(review): c0..c4 are presumably the index constants 0..4 created by the
// caller -- confirm at the call site.
void radf5(OpBuilder &builder, Location loc, Value cc, Value ch, Value wa,
           Value ido, Value l1, Value c0, Value c1, Value c2, Value c3,
           Value c4) {
  FloatType f64Ty = builder.getF64Type();
  Value cdim = builder.create<ConstantIndexOp>(loc, 5);
  Value tr11 = builder.create<ConstantFloatOp>(
      loc, APFloat(double(0.3090169943749474241)), f64Ty);
  Value tr12 = builder.create<ConstantFloatOp>(
      loc, APFloat(double(-0.8090169943749474241)), f64Ty);
  Value ti11 = builder.create<ConstantFloatOp>(
      loc, APFloat(double(0.95105651629515357212)), f64Ty);
  Value ti12 = builder.create<ConstantFloatOp>(
      loc, APFloat(double(0.58778525229247312917)), f64Ty);
  Value lastCol = builder.create<arith::SubIOp>(loc, ido, c1);

  auto firstColumnBody = [&](OpBuilder &nested, Location loc, Value k,
                             ValueRange) {
    // Sum/difference pairs of inputs (4, 1) and (3, 2).
    Value in4 = CC(nested, loc, cc, c0, k, c4, ido, l1);
    Value in1 = CC(nested, loc, cc, c0, k, c1, ido, l1);
    std::vector<Value> pm41 = PM(nested, loc, in4, in1);
    Value cr2 = pm41[0];
    Value ci5 = pm41[1];

    Value in3 = CC(nested, loc, cc, c0, k, c3, ido, l1);
    Value in2 = CC(nested, loc, cc, c0, k, c2, ido, l1);
    std::vector<Value> pm32 = PM(nested, loc, in3, in2);
    Value cr3 = pm32[0];
    Value ci4 = pm32[1];

    // CH(0, 0, k) = CC(0, k, 0) + cr2 + cr3
    Value in0 = CC(nested, loc, cc, c0, k, c0, ido, l1);
    Value in0PlusCr2 = nested.create<arith::AddFOp>(loc, in0, cr2);
    Value out0 = nested.create<arith::AddFOp>(loc, in0PlusCr2, cr3);
    CH(nested, loc, ch, c0, c0, k, ido, cdim, out0);

    // CH(ido-1, 1, k) = tr12 * cr3 + (CC(0, k, 0) + tr11 * cr2)
    Value tr11Cr2 = nested.create<arith::MulFOp>(loc, tr11, cr2);
    Value tr12Cr3 = nested.create<arith::MulFOp>(loc, tr12, cr3);
    Value in0PlusTr11Cr2 = nested.create<arith::AddFOp>(loc, in0, tr11Cr2);
    Value out1 = nested.create<arith::AddFOp>(loc, tr12Cr3, in0PlusTr11Cr2);
    CH(nested, loc, ch, lastCol, c1, k, ido, cdim, out1);

    // CH(0, 2, k) = ti11 * ci5 + ti12 * ci4
    Value ti11Ci5 = nested.create<arith::MulFOp>(loc, ti11, ci5);
    Value ti12Ci4 = nested.create<arith::MulFOp>(loc, ti12, ci4);
    Value out2 = nested.create<arith::AddFOp>(loc, ti11Ci5, ti12Ci4);
    CH(nested, loc, ch, c0, c2, k, ido, cdim, out2);

    // CH(ido-1, 3, k) = CC(0, k, 0) + (tr12 * cr2 + tr11 * cr3)
    Value tr12Cr2 = nested.create<arith::MulFOp>(loc, tr12, cr2);
    Value tr11Cr3 = nested.create<arith::MulFOp>(loc, tr11, cr3);
    Value crossSum = nested.create<arith::AddFOp>(loc, tr12Cr2, tr11Cr3);
    Value out3 = nested.create<arith::AddFOp>(loc, in0, crossSum);
    CH(nested, loc, ch, lastCol, c3, k, ido, cdim, out3);

    // CH(0, 4, k) = ti12 * ci5 - ti11 * ci4
    Value ti12Ci5 = nested.create<arith::MulFOp>(loc, ti12, ci5);
    Value ti11Ci4 = nested.create<arith::MulFOp>(loc, ti11, ci4);
    Value out4 = nested.create<arith::SubFOp>(loc, ti12Ci5, ti11Ci4);
    CH(nested, loc, ch, c0, c4, k, ido, cdim, out4);

    nested.create<scf::YieldOp>(loc, std::nullopt);
  };
  builder.create<scf::ForOp>(loc, c0, l1, c1, std::nullopt, firstColumnBody);

  // Remaining columns are only emitted under `ido != c1`.
  Value hasMoreColumns =
      builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne, ido, c1);
  auto extendBranch = [&](OpBuilder &thenBuilder, Location loc) {
    radf5Extend(thenBuilder, loc, cc, ch, wa, ido, l1, cdim, tr11, tr12, ti11,
                ti12, c0, c1, c2, c3, c4);
    thenBuilder.create<scf::YieldOp>(loc, std::nullopt);
  };
  builder.create<scf::IfOp>(loc, hasMoreColumns, extendBranch);
}

// Emit IR for `target[0] += 1`: load element 0 of the memref `target`, add the
// index constant 1, and store the sum back to element 0.
void index_increment(OpBuilder &opBuilder, Location loc, Value target) {
  Value zero = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value one = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value current = opBuilder.create<memref::LoadOp>(loc, target, zero);
  Value bumped = opBuilder.create<arith::AddIOp>(loc, current, one);
  opBuilder.create<memref::StoreOp>(loc, bumped, target, zero);
}

// Emit IR that exchanges two elements of the memref `array`: both elements are
// loaded first, then each value is stored at the other's position.
void index_SWAP(OpBuilder &opBuilder, Location loc, Value array, Value target1,
                Value target2) {
  Value first = opBuilder.create<memref::LoadOp>(loc, array, target1);
  Value second = opBuilder.create<memref::LoadOp>(loc, array, target2);

  opBuilder.create<memref::StoreOp>(loc, first, array, target2);
  opBuilder.create<memref::StoreOp>(loc, second, array, target1);
}

// Factorize the input length and store the factors in Rfftp_fctdata_fct.
//
// The emitted IR reads the length from Rfftp_plan_length[0] and then:
//   1. appends the factor 4 while the remaining length is divisible by 4,
//   2. if the remaining length is even, halves it, appends the factor 2 and
//      swaps that entry with entry 0 of the factor list,
//   3. trial-divides by the odd divisors 3, 5, 7, ... while the remaining
//      length is > 1 and the divisor is below trunc(sqrt(remaining)) + 1,
//      appending each divisor as often as it divides the remaining length,
//   4. appends whatever remains if it is still > 1,
// and finally writes the number of factors to Rfftp_plan_nfct[0].
//
// The three one-element scratch buffers allocated here (remaining length,
// factor count, divisor bound) do not escape, so they are released before
// returning.
//
// Rfftp_fctdata_tw, Rfftp_fctdata_tws and Rfftp_plan_mem are accepted but not
// used. No bound check is emitted on the number of factors.
// NOTE(review): the caller presumably sizes Rfftp_fctdata_fct for the worst
// case -- confirm. The square root goes through i32, so lengths outside the
// signed 32-bit range are not supported.
//
// Returns the index constant 0.
Value rfftp_factorize(OpBuilder &opBuilder, Location loc,
                      Value Rfftp_fctdata_fct, Value Rfftp_fctdata_tw,
                      Value Rfftp_fctdata_tws, Value Rfftp_plan_length,
                      Value Rfftp_plan_nfct, Value Rfftp_plan_mem) {
  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value c3 = opBuilder.create<ConstantIndexOp>(loc, 3);
  Value c4 = opBuilder.create<ConstantIndexOp>(loc, 4);

  IndexType indexTy = opBuilder.getIndexType();

  // Scratch cell holding the not-yet-factorized remainder of the length.
  Value length =
      opBuilder.create<memref::AllocOp>(loc, MemRefType::get(1, indexTy));
  Value length_1 = opBuilder.create<memref::LoadOp>(loc, Rfftp_plan_length, c0);
  opBuilder.create<memref::StoreOp>(loc, length_1, length, c0);

  // Scratch cell holding the number of factors found so far.
  Value nfct =
      opBuilder.create<memref::AllocOp>(loc, MemRefType::get(1, indexTy));

  opBuilder.create<memref::StoreOp>(loc, c0, nfct, c0);

  // Emits `fct[nfct] = factor; nfct += 1`.
  auto appendFactor = [&](OpBuilder &b, Location loc, Value factor) {
    Value current_nfct = b.create<memref::LoadOp>(loc, nfct, c0);
    b.create<memref::StoreOp>(loc, factor, Rfftp_fctdata_fct, current_nfct);
    index_increment(b, loc, nfct);
  };

  // Step 1: strip factors of 4. The remaining length is carried both as the
  // loop value and in the `length` cell.
  opBuilder.create<scf::WhileOp>(
      loc, TypeRange{indexTy}, ValueRange{length_1},
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value length_while = args[0];

        Value length_mod_4 =
            builder.create<arith::RemSIOp>(loc, length_while, c4);
        Value condition = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::eq, length_mod_4, c0);
        builder.create<scf::ConditionOp>(loc, condition,
                                         ValueRange{length_while});
      },
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value length_while = args[0];

        appendFactor(builder, loc, c4);
        Value length_next =
            builder.create<arith::ShRSIOp>(loc, length_while, c2);
        builder.create<memref::StoreOp>(loc, length_next, length, c0);

        builder.create<scf::YieldOp>(loc, std::vector<Value>{length_next});
      });

  // Step 2: strip a single factor of 2 and move it to the front of the list.
  Value length_if = opBuilder.create<memref::LoadOp>(loc, length, c0);
  Value length_if_mod_2 = opBuilder.create<arith::RemSIOp>(loc, length_if, c2);
  Value condition = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, length_if_mod_2, c0);

  opBuilder.create<scf::IfOp>(
      loc, condition, [&](OpBuilder &builder, Location loc) {
        Value length_next = builder.create<arith::ShRSIOp>(loc, length_if, c1);
        builder.create<memref::StoreOp>(loc, length_next, length, c0);

        appendFactor(builder, loc, c2);

        Value nfct_after = builder.create<memref::LoadOp>(loc, nfct, c0);
        Value nfctm1 = builder.create<arith::SubIOp>(loc, nfct_after, c1);
        index_SWAP(builder, loc, Rfftp_fctdata_fct, nfctm1, c0);

        builder.create<scf::YieldOp>(loc, std::nullopt);
      });

  // Scratch cell holding the exclusive upper bound for trial divisors.
  Value maxl =
      opBuilder.create<memref::AllocOp>(loc, MemRefType::get(1, indexTy));

  // Emits `maxl = (index)sqrt((double)length) + 1` from the current remainder.
  auto storeMaxl = [&](OpBuilder &b, Location loc) {
    Value remaining = b.create<memref::LoadOp>(loc, length, c0);
    Value remaining_i32 =
        b.create<arith::IndexCastOp>(loc, b.getI32Type(), remaining);
    Value remaining_f64 =
        b.create<arith::SIToFPOp>(loc, b.getF64Type(), remaining_i32);
    Value root = b.create<math::SqrtOp>(loc, remaining_f64);
    Value root_i32 = b.create<arith::FPToSIOp>(loc, b.getI32Type(), root);
    Value root_index =
        b.create<arith::IndexCastOp>(loc, b.getIndexType(), root_i32);
    Value bound = b.create<arith::AddIOp>(loc, root_index, c1);
    b.create<memref::StoreOp>(loc, bound, maxl, c0);
  };
  storeMaxl(opBuilder, loc);

  // Step 3: trial division by odd divisors starting at 3.
  opBuilder.create<scf::WhileOp>(
      loc, TypeRange{indexTy}, ValueRange{c3},
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value divisor = args[0];
        Value length_while = builder.create<memref::LoadOp>(loc, length, c0);
        Value current_maxl = builder.create<memref::LoadOp>(loc, maxl, c0);

        Value condition1 = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::sgt, length_while, c1);
        Value condition2 = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::slt, divisor, current_maxl);
        Value and_cond =
            builder.create<arith::AndIOp>(loc, condition1, condition2);
        builder.create<scf::ConditionOp>(loc, and_cond, ValueRange{divisor});
      },
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value divisor = args[0];

        Value length_while = builder.create<memref::LoadOp>(loc, length, c0);
        Value length_mod_divisor =
            builder.create<arith::RemSIOp>(loc, length_while, divisor);
        Value condition1 = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::eq, length_mod_divisor, c0);
        builder.create<scf::IfOp>(
            loc, condition1, [&](OpBuilder &b, Location loc) {
              // Divide out `divisor` as often as possible. The loop-carried
              // value is a placeholder; the real state lives in `length`.
              b.create<scf::WhileOp>(
                  loc, TypeRange{indexTy}, ValueRange{c1},
                  [&](OpBuilder &b2, Location loc, ValueRange args) {
                    Value x = args[0];

                    Value length_while_1 =
                        b2.create<memref::LoadOp>(loc, length, c0);
                    Value length_mod_divisor_1 =
                        b2.create<arith::RemSIOp>(loc, length_while_1, divisor);

                    Value condition2 =
                        b2.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                                 length_mod_divisor_1, c0);
                    b2.create<scf::ConditionOp>(loc, condition2, ValueRange{x});
                  },
                  [&](OpBuilder &b2, Location loc, ValueRange args) {
                    Value x = args[0];

                    appendFactor(b2, loc, divisor);

                    Value length_while_1 =
                        b2.create<memref::LoadOp>(loc, length, c0);
                    Value length_new =
                        b2.create<arith::DivSIOp>(loc, length_while_1, divisor);
                    b2.create<memref::StoreOp>(loc, length_new, length, c0);

                    b2.create<scf::YieldOp>(loc, std::vector<Value>{x});
                  });

              // The remainder shrank, so tighten the divisor bound.
              storeMaxl(b, loc);

              b.create<scf::YieldOp>(loc, std::nullopt);
            });

        Value divisor_next = builder.create<arith::AddIOp>(loc, divisor, c2);
        builder.create<scf::YieldOp>(loc, std::vector<Value>{divisor_next});
      });

  // Step 4: whatever is left (> 1) is itself the last factor.
  Value current_length1 = opBuilder.create<memref::LoadOp>(loc, length, c0);
  Value condition1 = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::sgt, current_length1, c1);
  opBuilder.create<scf::IfOp>(
      loc, condition1, [&](OpBuilder &builder, Location loc) {
        appendFactor(builder, loc, current_length1);
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });

  Value current_nfct1 = opBuilder.create<memref::LoadOp>(loc, nfct, c0);
  opBuilder.create<memref::StoreOp>(loc, current_nfct1, Rfftp_plan_nfct, c0);

  // Release the scratch cells; none of them is referenced past this point.
  opBuilder.create<memref::DeallocOp>(loc, maxl);
  opBuilder.create<memref::DeallocOp>(loc, nfct);
  opBuilder.create<memref::DeallocOp>(loc, length);

  return c0;
}

// Emit IR converting the index value `n` to f64.
//
// The conversion is index -> i32 (arith.index_cast) -> f64 (arith.sitofp), so
// the value passes through a signed 32-bit integer on the way.
Value index_to_f64(OpBuilder &opBuilder, Location loc, Value n) {
  Value asI32 =
      opBuilder.create<arith::IndexCastOp>(loc, opBuilder.getI32Type(), n);
  return opBuilder.create<arith::SIToFPOp>(loc, opBuilder.getF64Type(), asI32);
}

// Emit IR converting the f64 value `n_f64` to an index value.
//
// The conversion is f64 -> i32 (arith.fptosi) -> index (arith.index_cast), so
// the value passes through a signed 32-bit integer on the way.
Value f64_to_index(OpBuilder &opBuilder, Location loc, Value n_f64) {
  Value asI32 =
      opBuilder.create<arith::FPToSIOp>(loc, opBuilder.getI32Type(), n_f64);
  return opBuilder.create<arith::IndexCastOp>(loc, opBuilder.getIndexType(),
                                              asI32);
}

// Emit IR that evaluates two polynomials in `a` and stores the results into
// `res` at positions `bias` and `bias + 1`.
//
// With s = a * a:
//   res[bias]     = s * P(s)                  (P has 7 coefficients)
//   res[bias + 1] = fma(a, pi, (s * a) * Q(s)) (Q has 6 coefficients)
// Both P and Q are evaluated with a chain of math.fma ops (Horner scheme).
// NOTE(review): going by the function name and the pocketfft routine of the
// same name, these approximate cos(pi*a) - 1 and sin(pi*a) for small |a| --
// confirm the intended input range.
void my_sincosm1pi(OpBuilder &opBuilder, Location loc, Value a, Value res,
                   Value bias) {
  Value idx0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value idx1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value idx2 = opBuilder.create<ConstantIndexOp>(loc, 2);

  FloatType f64Ty = opBuilder.getF64Type();

  // Result type of the window: memref<?xf64, strided<[?], offset: ?>>
  StridedLayoutAttr dynLayout = StridedLayoutAttr::get(
      opBuilder.getContext(),
      /*offset=*/ShapedType::kDynamic, /*strides=*/{ShapedType::kDynamic});
  MemRefType windowTy =
      MemRefType::get(ShapedType::kDynamic, f64Ty, dynLayout);

  // Two-element, unit-stride window into `res` starting at `bias`.
  Value window = opBuilder.create<memref::SubViewOp>(
      loc, windowTy, res, SmallVector<OpFoldResult>{bias},
      SmallVector<OpFoldResult>{idx2}, SmallVector<OpFoldResult>{idx1});

  Value aSquared = opBuilder.create<arith::MulFOp>(loc, a, a);

  auto makeF64 = [&](double v) -> Value {
    return opBuilder.create<ConstantFloatOp>(loc, APFloat(v), f64Ty);
  };

  // Materializes every coefficient first, then folds them highest-order first
  // with acc = fma(acc, aSquared, next).
  auto emitHorner = [&](ArrayRef<double> coeffs) -> Value {
    SmallVector<Value> consts;
    for (double coeff : coeffs)
      consts.push_back(makeF64(coeff));
    Value acc = consts.front();
    for (size_t i = 1, e = consts.size(); i < e; ++i)
      acc = opBuilder.create<math::FmaOp>(loc, acc, aSquared, consts[i]);
    return acc;
  };

  const double firstCoeffs[] = {
      -1.0369917389758117e-4, 1.9294935641298806e-3, -2.5806887942825395e-2,
      2.3533063028328211e-1,  -1.3352627688538006e+0, 4.0587121264167623e+0,
      -4.9348022005446790e+0};
  Value firstPoly = emitHorner(firstCoeffs);
  Value firstResult = opBuilder.create<arith::MulFOp>(loc, firstPoly, aSquared);

  const double secondCoeffs[] = {
      4.6151442520157035e-4,  -7.3700183130883555e-3, 8.2145868949323936e-2,
      -5.9926452893214921e-1, 2.5501640398732688e+0,  -5.1677127800499516e+0};
  Value secondPoly = emitHorner(secondCoeffs);

  Value aCubed = opBuilder.create<arith::MulFOp>(loc, aSquared, a);
  Value correction = opBuilder.create<arith::MulFOp>(loc, secondPoly, aCubed);

  Value pi = makeF64(3.1415926535897931e+0);
  Value secondResult = opBuilder.create<math::FmaOp>(loc, a, pi, correction);

  opBuilder.create<memref::StoreOp>(loc, firstResult, window, idx0);
  opBuilder.create<memref::StoreOp>(loc, secondResult, window, idx1);
}

// Emit the main part of the first-octant table computation.
//
// With n = (den + 4) >> 3 and l1 = (index)sqrt((double)n), and all positions
// taken relative to `bias` inside `res`, the emitted IR:
//   1. for i in [1, l1): calls my_sincosm1pi on 2*i/den, writing entry i
//      (positions 2*i and 2*i+1),
//   2. for start = l1, 2*l1, ... < n: calls my_sincosm1pi on 2*start/den into
//      a temporary two-element buffer `cs`, writes entry `start` as
//      (cs[0] + 1, cs[1]), and fills entries start+1 .. start+end-1 by
//      combining `cs` with the entries written in step 1, where
//      end = (start + l1 > n) ? n - start : l1,
//   3. for i in [1, l1): adds 1.0 to position 2*i.
// NOTE(review): entry 0 is not written here; calc_first_octant_extend1 stores
// it before calling this function.
void calc_first_octant_extend2(OpBuilder &opBuilder, Location loc, Value den,
                               Value res, Value bias) {
  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value c3 = opBuilder.create<ConstantIndexOp>(loc, 3);
  Value c4 = opBuilder.create<ConstantIndexOp>(loc, 4);

  // n = (den + 4) >> 3
  Value denPlus4 = opBuilder.create<arith::AddIOp>(loc, den, c4);
  Value n = opBuilder.create<arith::ShRUIOp>(loc, denPlus4, c3);

  Value totalSize = opBuilder.create<memref::DimOp>(loc, res, c0);
  Value tailSize = opBuilder.create<arith::SubIOp>(loc, totalSize, bias);

  FloatType f64Ty = opBuilder.getF64Type();

  // Result type of the window: memref<?xf64, strided<[?], offset: ?>>
  StridedLayoutAttr dynLayout = StridedLayoutAttr::get(
      opBuilder.getContext(),
      /*offset=*/ShapedType::kDynamic, /*strides=*/{ShapedType::kDynamic});
  MemRefType windowTy =
      MemRefType::get(ShapedType::kDynamic, f64Ty, dynLayout);

  // Unit-stride window over res[bias ..], so positions below are relative.
  Value table = opBuilder.create<memref::SubViewOp>(
      loc, windowTy, res, SmallVector<OpFoldResult>{bias},
      SmallVector<OpFoldResult>{tailSize}, SmallVector<OpFoldResult>{c1});

  Value two =
      opBuilder.create<ConstantFloatOp>(loc, APFloat(double(2.0)), f64Ty);
  Value one =
      opBuilder.create<ConstantFloatOp>(loc, APFloat(double(1.0)), f64Ty);

  // l1 = (index)sqrt((double)n)
  Value nAsF64 = index_to_f64(opBuilder, loc, n);
  Value rootF64 = opBuilder.create<math::SqrtOp>(loc, nAsF64);
  Value l1 = f64_to_index(opBuilder, loc, rootF64);

  // Step 1: direct evaluation of entries 1 .. l1-1.
  auto directBody = [&](OpBuilder &nested, Location loc, Value i, ValueRange) {
    Value iF64 = index_to_f64(nested, loc, i);
    Value denF64 = index_to_f64(nested, loc, den);
    Value ratio = nested.create<arith::DivFOp>(loc, iF64, denF64);
    Value angle = nested.create<arith::MulFOp>(loc, ratio, two);

    // my_sincosm1pi takes an absolute position into `res`.
    Value relPos = nested.create<arith::MulIOp>(loc, i, c2);
    Value absPos = nested.create<arith::AddIOp>(loc, relPos, bias);

    my_sincosm1pi(nested, loc, angle, res, absPos);
    nested.create<scf::YieldOp>(loc, std::nullopt);
  };
  opBuilder.create<scf::ForOp>(loc, c1, l1, c1, std::nullopt, directBody);

  Value firstStart = opBuilder.create<arith::AddIOp>(loc, l1, c0);

  // Step 2: blocks of up to l1 entries, each derived from one fresh
  // evaluation and the entries produced by step 1.
  auto blockBody = [&](OpBuilder &nested, Location loc, Value start,
                       ValueRange) {
    Value startF64 = index_to_f64(nested, loc, start);
    Value denF64 = index_to_f64(nested, loc, den);
    Value ratio = nested.create<arith::DivFOp>(loc, startF64, denF64);
    Value angle = nested.create<arith::MulFOp>(loc, ratio, two);

    // Temporary pair for this block; released at the end of the iteration.
    Value cs = nested.create<memref::AllocOp>(loc, MemRefType::get(2, f64Ty));
    my_sincosm1pi(nested, loc, angle, cs, c0);

    Value cs0 = nested.create<memref::LoadOp>(loc, cs, c0);
    Value cs1 = nested.create<memref::LoadOp>(loc, cs, c1);

    Value cs0PlusOne = nested.create<arith::AddFOp>(loc, cs0, one);

    Value headPos = nested.create<arith::MulIOp>(loc, start, c2);
    nested.create<memref::StoreOp>(loc, cs0PlusOne, table, headPos);
    Value headPosNext = nested.create<arith::AddIOp>(loc, headPos, c1);
    nested.create<memref::StoreOp>(loc, cs1, table, headPosNext);

    // end = (start + l1 > n) ? n - start : l1
    Value leftover = nested.create<arith::SubIOp>(loc, n, start);
    Value fullBlock = nested.create<arith::AddIOp>(loc, l1, c0);
    Value blockEnd = nested.create<arith::AddIOp>(loc, start, fullBlock);
    Value overshoots = nested.create<arith::CmpIOp>(
        loc, arith::CmpIPredicate::sgt, blockEnd, n);
    Value end =
        nested.create<arith::SelectOp>(loc, overshoots, leftover, fullBlock);

    auto combineBody = [&](OpBuilder &inner, Location loc, Value i,
                           ValueRange) {
      Value srcPos = inner.create<arith::MulIOp>(loc, i, c2);
      Value csx0 = inner.create<memref::LoadOp>(loc, table, srcPos);
      Value srcPosNext = inner.create<arith::AddIOp>(loc, srcPos, c1);
      Value csx1 = inner.create<memref::LoadOp>(loc, table, srcPosNext);

      // real = ((cs0*csx0 - cs1*csx1 + cs0) + csx0) + 1
      Value prod00 = inner.create<arith::MulFOp>(loc, cs0, csx0);
      Value prod11 = inner.create<arith::MulFOp>(loc, cs1, csx1);
      Value realDiff = inner.create<arith::SubFOp>(loc, prod00, prod11);
      Value realPlusCs0 = inner.create<arith::AddFOp>(loc, realDiff, cs0);
      Value realPlusCsx0 = inner.create<arith::AddFOp>(loc, realPlusCs0, csx0);
      Value realPart = inner.create<arith::AddFOp>(loc, realPlusCsx0, one);

      // imag = (cs0*csx1 + cs1*csx0) + cs1 + csx1
      Value prod01 = inner.create<arith::MulFOp>(loc, cs0, csx1);
      Value prod10 = inner.create<arith::MulFOp>(loc, cs1, csx0);
      Value imagSum = inner.create<arith::AddFOp>(loc, prod01, prod10);
      Value imagPlusCs1 = inner.create<arith::AddFOp>(loc, imagSum, cs1);
      Value imagPart = inner.create<arith::AddFOp>(loc, imagPlusCs1, csx1);

      Value dstEntry = inner.create<arith::AddIOp>(loc, start, i);
      Value dstPos = inner.create<arith::MulIOp>(loc, dstEntry, c2);
      Value dstPosNext = inner.create<arith::AddIOp>(loc, dstPos, c1);
      inner.create<memref::StoreOp>(loc, realPart, table, dstPos);
      inner.create<memref::StoreOp>(loc, imagPart, table, dstPosNext);
      inner.create<scf::YieldOp>(loc, std::nullopt);
    };
    nested.create<scf::ForOp>(loc, c1, end, c1, std::nullopt, combineBody);

    nested.create<memref::DeallocOp>(loc, cs);
    nested.create<scf::YieldOp>(loc, std::nullopt);
  };
  opBuilder.create<scf::ForOp>(loc, firstStart, n, l1, std::nullopt, blockBody);

  // Step 3: add 1.0 to the first component of entries 1 .. l1-1.
  auto fixupBody = [&](OpBuilder &nested, Location loc, Value i, ValueRange) {
    Value pos = nested.create<arith::MulIOp>(loc, i, c2);
    Value stored = nested.create<memref::LoadOp>(loc, table, pos);
    Value shifted = nested.create<arith::AddFOp>(loc, stored, one);
    nested.create<memref::StoreOp>(loc, shifted, table, pos);
    nested.create<scf::YieldOp>(loc, std::nullopt);
  };
  opBuilder.create<scf::ForOp>(loc, c1, l1, c1, std::nullopt, fixupBody);
}

/// Emits the first stage of the first-octant table computation.
///
/// The generated IR evaluates `n = (den + 4) >> 3`, takes a 1-D dynamic view
/// of `res` that starts at element `bias` and extends to the end of `res`, and
/// stores 1.0 and 0.0 into elements 0 and 1 of that view. When `n != 1`, the
/// remaining work is emitted by `calc_first_octant_extend2` inside an
/// `scf.if`.
void calc_first_octant_extend1(OpBuilder &opBuilder, Location loc, Value den,
                               Value res, Value bias) {
  Value zeroIdx = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value oneIdx = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value threeIdx = opBuilder.create<ConstantIndexOp>(loc, 3);
  Value fourIdx = opBuilder.create<ConstantIndexOp>(loc, 4);

  // n = (den + 4) >> 3
  Value denBiased = opBuilder.create<arith::AddIOp>(loc, den, fourIdx);
  Value entryCount =
      opBuilder.create<arith::ShRUIOp>(loc, denBiased, threeIdx);

  // Length of the tail of `res` that begins at `bias`.
  Value fullLen = opBuilder.create<memref::DimOp>(loc, res, zeroIdx);
  Value tailLen = opBuilder.create<arith::SubIOp>(loc, fullLen, bias);

  FloatType elemTy = opBuilder.getF64Type();

  // Type of the view: memref<?xf64, strided<[?], offset: ?>>
  FailureOr<StridedLayoutAttr> dynLayout = StridedLayoutAttr::get(
      opBuilder.getContext(),
      /*offset=*/ShapedType::kDynamic, /*strides=*/{ShapedType::kDynamic});
  MemRefType viewTy =
      MemRefType::get(ShapedType::kDynamic, elemTy, *dynLayout);

  Value tailView = opBuilder.create<memref::SubViewOp>(
      loc, viewTy, res, SmallVector<OpFoldResult>{bias},
      SmallVector<OpFoldResult>{tailLen}, SmallVector<OpFoldResult>{oneIdx});

  Value oneF64 =
      opBuilder.create<ConstantFloatOp>(loc, APFloat(double(1.0)), elemTy);
  Value zeroF64 =
      opBuilder.create<ConstantFloatOp>(loc, APFloat(double(0.0)), elemTy);

  // First table entry: view[0] = 1.0, view[1] = 0.0.
  opBuilder.create<memref::StoreOp>(loc, oneF64, tailView, zeroIdx);
  opBuilder.create<memref::StoreOp>(loc, zeroF64, tailView, oneIdx);

  // Nothing more to do when the table holds a single entry.
  Value hasMoreEntries = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::ne, entryCount, oneIdx);

  opBuilder.create<scf::IfOp>(
      loc, hasMoreEntries, [&](OpBuilder &thenBuilder, Location thenLoc) {
        calc_first_octant_extend2(thenBuilder, thenLoc, den, res, bias);
        thenBuilder.create<scf::YieldOp>(thenLoc, std::nullopt);
      });
}

/// Emits a guarded first-octant table computation.
///
/// The generated IR evaluates `n = (den + 4) >> 3` and, only when `n != 0`,
/// executes the IR produced by `calc_first_octant_extend1` for the same
/// `den`, `res` and `bias`.
void calc_first_octant(OpBuilder &opBuilder, Location loc, Value den, Value res,
                       Value bias) {
  Value zeroIdx = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value threeIdx = opBuilder.create<ConstantIndexOp>(loc, 3);
  Value fourIdx = opBuilder.create<ConstantIndexOp>(loc, 4);

  // n = (den + 4) >> 3
  Value denBiased = opBuilder.create<arith::AddIOp>(loc, den, fourIdx);
  Value entryCount =
      opBuilder.create<arith::ShRUIOp>(loc, denBiased, threeIdx);

  // An empty table needs no work at all.
  Value isNonEmpty = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::ne, entryCount, zeroIdx);

  opBuilder.create<scf::IfOp>(
      loc, isNonEmpty, [&](OpBuilder &thenBuilder, Location thenLoc) {
        calc_first_octant_extend1(thenBuilder, thenLoc, den, res, bias);
        thenBuilder.create<scf::YieldOp>(thenLoc, std::nullopt);
      });
}

/// Emits IR that fills the leading part of `res` from a first-octant table.
///
/// A first-octant table for `2 * n` is generated into `res` starting at
/// element `n` (via `calc_first_octant`). With `ndone = (n + 2) >> 2`, a loop
/// then consumes that table two entries at a time: the first entry of each
/// pair is copied to an ascending position in `res`, the second entry is
/// copied with its two components swapped to a descending position that
/// starts at `2 * ndone - 2`. A trailing `scf.if` copies one more entry when
/// the loop counter did not finish exactly on `ndone`.
void calc_first_quadrant(OpBuilder &opBuilder, Location loc, Value n,
                         Value res) {
  Value zeroIdx = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value oneIdx = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value twoIdx = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value threeIdx = opBuilder.create<ConstantIndexOp>(loc, 3);

  // Length of the tail of `res` that begins at element `n`.
  Value fullLen = opBuilder.create<memref::DimOp>(loc, res, zeroIdx);
  Value tailLen = opBuilder.create<arith::SubIOp>(loc, fullLen, n);

  FloatType elemTy = opBuilder.getF64Type();

  // Type of the view: memref<?xf64, strided<[?], offset: ?>>
  FailureOr<StridedLayoutAttr> dynLayout = StridedLayoutAttr::get(
      opBuilder.getContext(),
      /*offset=*/ShapedType::kDynamic, /*strides=*/{ShapedType::kDynamic});
  MemRefType viewTy =
      MemRefType::get(ShapedType::kDynamic, elemTy, *dynLayout);

  // `table` addresses the octant table stored in `res` at offset `n`.
  Value table = opBuilder.create<memref::SubViewOp>(
      loc, viewTy, res, SmallVector<OpFoldResult>{n},
      SmallVector<OpFoldResult>{tailLen}, SmallVector<OpFoldResult>{oneIdx});

  Value doubledN = opBuilder.create<arith::ShLIOp>(loc, n, oneIdx);
  calc_first_octant(opBuilder, loc, doubledN, res, n);

  // ndone = (n + 2) >> 2; the descending cursor starts at 2 * ndone - 2.
  Value nPlus2 = opBuilder.create<arith::AddIOp>(loc, n, twoIdx);
  Value doneCount = opBuilder.create<arith::ShRUIOp>(loc, nPlus2, twoIdx);
  Value doneCountM1 = opBuilder.create<arith::SubIOp>(loc, doneCount, oneIdx);
  Value doneCountX2 = opBuilder.create<arith::MulIOp>(loc, doneCount, twoIdx);
  Value backInit = opBuilder.create<arith::SubIOp>(loc, doneCountX2, twoIdx);

  Value counterInit = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value frontInit = opBuilder.create<ConstantIndexOp>(loc, 0);

  // Carried values: entry counter, ascending cursor, descending cursor.
  auto pairLoop = opBuilder.create<scf::ForOp>(
      loc, counterInit, doneCountM1, twoIdx,
      ValueRange{counterInit, frontInit, backInit},
      [&](OpBuilder &body, Location bodyLoc, Value /*inductionVar*/,
          ValueRange carried) {
        Value entry = carried[0];
        Value front = carried[1];
        Value back = carried[2];

        // res[front] = table[2 * entry]
        Value src0 = body.create<arith::MulIOp>(bodyLoc, entry, twoIdx);
        Value v0 = body.create<memref::LoadOp>(bodyLoc, table, src0);
        body.create<memref::StoreOp>(bodyLoc, v0, res, front);

        // res[front + 1] = table[2 * entry + 1]
        Value src1 = body.create<arith::AddIOp>(bodyLoc, src0, oneIdx);
        Value v1 = body.create<memref::LoadOp>(bodyLoc, table, src1);
        Value frontP1 = body.create<arith::AddIOp>(bodyLoc, front, oneIdx);
        body.create<memref::StoreOp>(bodyLoc, v1, res, frontP1);

        // res[back] = table[2 * entry + 3]
        Value src3 = body.create<arith::AddIOp>(bodyLoc, src0, threeIdx);
        Value v3 = body.create<memref::LoadOp>(bodyLoc, table, src3);
        body.create<memref::StoreOp>(bodyLoc, v3, res, back);

        // res[back + 1] = table[2 * entry + 2]
        Value src2 = body.create<arith::AddIOp>(bodyLoc, src0, twoIdx);
        Value v2 = body.create<memref::LoadOp>(bodyLoc, table, src2);
        Value backP1 = body.create<arith::AddIOp>(bodyLoc, back, oneIdx);
        body.create<memref::StoreOp>(bodyLoc, v2, res, backP1);

        Value entryNext = body.create<arith::AddIOp>(bodyLoc, entry, twoIdx);
        Value frontNext = body.create<arith::AddIOp>(bodyLoc, front, twoIdx);
        Value backNext = body.create<arith::SubIOp>(bodyLoc, back, twoIdx);
        body.create<scf::YieldOp>(bodyLoc,
                                  ValueRange{entryNext, frontNext, backNext});
      });

  Value entryAfter = pairLoop.getResults()[0];
  Value frontAfter = pairLoop.getResults()[1];

  // One unpaired entry remains when the counter stopped short of `ndone`.
  Value hasLeftover = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::ne, entryAfter, doneCount);

  opBuilder.create<scf::IfOp>(
      loc, hasLeftover, [&](OpBuilder &thenBuilder, Location thenLoc) {
        Value src0 =
            thenBuilder.create<arith::MulIOp>(thenLoc, entryAfter, twoIdx);
        Value v0 = thenBuilder.create<memref::LoadOp>(thenLoc, table, src0);
        thenBuilder.create<memref::StoreOp>(thenLoc, v0, res, frontAfter);

        Value src1 = thenBuilder.create<arith::AddIOp>(thenLoc, src0, oneIdx);
        Value v1 = thenBuilder.create<memref::LoadOp>(thenLoc, table, src1);
        Value frontP1 =
            thenBuilder.create<arith::AddIOp>(thenLoc, frontAfter, oneIdx);
        thenBuilder.create<memref::StoreOp>(thenLoc, v1, res, frontP1);
        thenBuilder.create<scf::YieldOp>(thenLoc, std::nullopt);
      });

  return;
}

/// Emits IR that fills the leading part of `res` from a first-octant table
/// generated for `4 * n`.
///
/// The table is produced by `calc_first_octant` into `res` starting at element
/// `n - 1`; `p_raw` is a view of `res` with that same offset. Four consecutive
/// `scf.while` loops share the counters `i4` (advanced by 4 per iteration) and
/// `i` (advanced by 1 per iteration); each loop resumes from the values the
/// previous one ended with. Writing `p` for `p_raw`, the loops store:
///   1. while i4 <= n - i4:
///        res[2i] =  p[2*i4];    res[2i+1] = p[2*i4+1]
///   2. while i4 - n <= 0, with xm = n - i4:
///        res[2i] =  p[2*xm+1];  res[2i+1] = p[2*xm]
///   3. while i4 <= 3n - i4, with xm = i4 - n:
///        res[2i] = -p[2*xm+1];  res[2i+1] = p[2*xm]
///   4. while i < (n + 1) >> 1, with xm = 2n - i4:
///        res[2i] = -p[2*xm];    res[2i+1] = p[2*xm+1]
/// All loop conditions use signed comparisons.
void calc_first_half(OpBuilder &opBuilder, Location loc, Value n, Value res) {
  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value c3 = opBuilder.create<ConstantIndexOp>(loc, 3);
  Value c4 = opBuilder.create<ConstantIndexOp>(loc, 4);
  // TODO: remove c5?
  // Value c5 = opBuilder.create<ConstantIndexOp>(loc, 5);

  IndexType indexTy = opBuilder.getIndexType();
  FloatType f64Ty = opBuilder.getF64Type();

  // 0.0 is used to negate values as `0.0 - x`.
  Value f0 =
      opBuilder.create<ConstantFloatOp>(loc, APFloat(double(0.0)), f64Ty);
  // TODO: remove f1?
  // Value f1 =
  //     opBuilder.create<ConstantFloatOp>(loc, APFloat(double(1.0)), f64Ty);

  // ndone = (n + 1) >> 1: upper bound on `i` for the last loop.
  Value n_plus_1 = opBuilder.create<arith::AddIOp>(loc, n, c1);
  Value ndone = opBuilder.create<arith::ShRUIOp>(loc, n_plus_1, c1);

  // Size of the view that starts at element n - 1: dim(res) - n + 1.
  Value size = opBuilder.create<memref::DimOp>(loc, res, c0);
  Value remaining_size = opBuilder.create<arith::SubIOp>(loc, size, n);
  Value remaining_size_p1 =
      opBuilder.create<arith::AddIOp>(loc, remaining_size, c1);

  Value nm1 = opBuilder.create<arith::SubIOp>(loc, n, c1);

  FailureOr<StridedLayoutAttr> computelayout = StridedLayoutAttr::get(
      opBuilder.getContext(),
      /*offset=*/ShapedType::kDynamic, /*strides=*/{ShapedType::kDynamic});
  MemRefType resultType =
      MemRefType::get(ShapedType::kDynamic, f64Ty, *computelayout);

  // memref<?xf64, strided<[?], offset: ?>>

  // `p_raw` aliases `res` starting at element n - 1.
  Value p_raw = opBuilder.create<memref::SubViewOp>(
      loc, resultType, res, SmallVector<OpFoldResult>{nm1},
      SmallVector<OpFoldResult>{remaining_size_p1},
      SmallVector<OpFoldResult>{c1});

  // Generate the octant table for 4 * n into `res` at offset n - 1.
  Value n_times_4 = opBuilder.create<arith::ShLIOp>(loc, n, c2);
  calc_first_octant(opBuilder, loc, n_times_4, res, nm1);

  Value i4_start = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value i_start = opBuilder.create<ConstantIndexOp>(loc, 0);
  // `in` is n + 0, i.e. the same value as `n`.
  Value in = opBuilder.create<arith::AddIOp>(loc, n, c0);

  // Loop 1: straight copy of table entry i4 to output entry i.
  auto loop = opBuilder.create<scf::WhileOp>(
      loc, TypeRange{indexTy, indexTy}, ValueRange{i4_start, i_start},
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value i4 = args[0];
        Value i = args[1];

        // Continue while i4 <= in - i4.
        Value in_minus_i4 = builder.create<arith::SubIOp>(loc, in, i4);
        Value condition = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::sle, i4, in_minus_i4);
        builder.create<scf::ConditionOp>(loc, condition, ValueRange{i4, i});
      },
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value i4 = args[0];
        Value i = args[1];

        Value i4_2 = builder.create<arith::MulIOp>(loc, i4, c2);
        Value i_2 = builder.create<arith::MulIOp>(loc, i, c2);
        Value i4_2_p1 = builder.create<arith::AddIOp>(loc, i4_2, c1);
        Value i_2_p1 = builder.create<arith::AddIOp>(loc, i_2, c1);

        // Both loads are issued before either store.
        Value p_i4_2 = builder.create<memref::LoadOp>(loc, p_raw, i4_2);
        Value p_i4_2_p1 = builder.create<memref::LoadOp>(loc, p_raw, i4_2_p1);

        builder.create<memref::StoreOp>(loc, p_i4_2, res, i_2);
        builder.create<memref::StoreOp>(loc, p_i4_2_p1, res, i_2_p1);

        Value i4_next = builder.create<arith::AddIOp>(loc, i4, c4);
        Value i_next = builder.create<arith::AddIOp>(loc, i, c1);
        builder.create<scf::YieldOp>(loc, std::vector<Value>{i4_next, i_next});
      });

  Value final_i4_0 = loop.getResults()[0];
  Value final_i_0 = loop.getResults()[1];

  // Loop 2: table entry xm = in - i4, stored with its two components swapped.
  auto loop1 = opBuilder.create<scf::WhileOp>(
      loc, TypeRange{indexTy, indexTy}, ValueRange{final_i4_0, final_i_0},
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value i4 = args[0];
        Value i = args[1];

        // Continue while i4 - in <= 0.
        Value i4_minus_in = builder.create<arith::SubIOp>(loc, i4, in);
        Value condition = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::sle, i4_minus_in, c0);
        builder.create<scf::ConditionOp>(loc, condition, ValueRange{i4, i});
      },
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value i4 = args[0];
        Value i = args[1];

        Value xm = builder.create<arith::SubIOp>(loc, in, i4);
        Value xm_2 = builder.create<arith::MulIOp>(loc, xm, c2);
        Value i_2 = builder.create<arith::MulIOp>(loc, i, c2);
        Value xm_2_p1 = builder.create<arith::AddIOp>(loc, xm_2, c1);
        Value i_2_p1 = builder.create<arith::AddIOp>(loc, i_2, c1);

        Value p_xm_2_p1 = builder.create<memref::LoadOp>(loc, p_raw, xm_2_p1);
        Value p_xm_2 = builder.create<memref::LoadOp>(loc, p_raw, xm_2);

        builder.create<memref::StoreOp>(loc, p_xm_2_p1, res, i_2);
        builder.create<memref::StoreOp>(loc, p_xm_2, res, i_2_p1);

        Value i4_next = builder.create<arith::AddIOp>(loc, i4, c4);
        Value i_next = builder.create<arith::AddIOp>(loc, i, c1);
        builder.create<scf::YieldOp>(loc, std::vector<Value>{i4_next, i_next});
      });

  Value final_i4_1 = loop1.getResults()[0];
  Value final_i_1 = loop1.getResults()[1];

  // Loop 3: table entry xm = i4 - in, swapped, first stored value negated.
  auto loop2 = opBuilder.create<scf::WhileOp>(
      loc, TypeRange{indexTy, indexTy}, ValueRange{final_i4_1, final_i_1},
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value i4 = args[0];
        Value i = args[1];

        // Continue while i4 <= 3 * in - i4.
        Value in_3 = builder.create<arith::MulIOp>(loc, in, c3);
        Value in_3_m_i4 = builder.create<arith::SubIOp>(loc, in_3, i4);
        Value condition = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::sle, i4, in_3_m_i4);
        builder.create<scf::ConditionOp>(loc, condition, ValueRange{i4, i});
      },
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value i4 = args[0];
        Value i = args[1];

        Value xm = builder.create<arith::SubIOp>(loc, i4, in);
        Value xm_2 = builder.create<arith::MulIOp>(loc, xm, c2);
        Value i_2 = builder.create<arith::MulIOp>(loc, i, c2);
        Value xm_2_p1 = builder.create<arith::AddIOp>(loc, xm_2, c1);
        Value i_2_p1 = builder.create<arith::AddIOp>(loc, i_2, c1);

        Value p_xm_2_p1 = builder.create<memref::LoadOp>(loc, p_raw, xm_2_p1);
        Value p_xm_2 = builder.create<memref::LoadOp>(loc, p_raw, xm_2);

        // Negate as 0.0 - p[2*xm+1].
        Value m_p_xm_2_p1 = builder.create<arith::SubFOp>(loc, f0, p_xm_2_p1);

        builder.create<memref::StoreOp>(loc, m_p_xm_2_p1, res, i_2);
        builder.create<memref::StoreOp>(loc, p_xm_2, res, i_2_p1);

        Value i4_next = builder.create<arith::AddIOp>(loc, i4, c4);
        Value i_next = builder.create<arith::AddIOp>(loc, i, c1);
        builder.create<scf::YieldOp>(loc, std::vector<Value>{i4_next, i_next});
      });

  Value final_i4_2 = loop2.getResults()[0];
  Value final_i_2 = loop2.getResults()[1];

  // Loop 4: table entry xm = 2 * in - i4, first stored value negated; runs
  // until `i` reaches `ndone`. Its results are not used afterwards.
  opBuilder.create<scf::WhileOp>(
      loc, TypeRange{indexTy, indexTy}, ValueRange{final_i4_2, final_i_2},
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value i4 = args[0];
        Value i = args[1];

        Value condition = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::slt, i, ndone);
        builder.create<scf::ConditionOp>(loc, condition, ValueRange{i4, i});
      },
      [&](OpBuilder &builder, Location loc, ValueRange args) {
        Value i4 = args[0];
        Value i = args[1];

        Value in_2 = builder.create<arith::MulIOp>(loc, in, c2);

        Value xm = builder.create<arith::SubIOp>(loc, in_2, i4);
        Value xm_2 = builder.create<arith::MulIOp>(loc, xm, c2);
        Value i_2 = builder.create<arith::MulIOp>(loc, i, c2);
        Value xm_2_p1 = builder.create<arith::AddIOp>(loc, xm_2, c1);
        Value i_2_p1 = builder.create<arith::AddIOp>(loc, i_2, c1);

        Value p_xm_2_p1 = builder.create<memref::LoadOp>(loc, p_raw, xm_2_p1);
        Value p_xm_2 = builder.create<memref::LoadOp>(loc, p_raw, xm_2);

        // Negate as 0.0 - p[2*xm].
        Value m_p_xm_2 = builder.create<arith::SubFOp>(loc, f0, p_xm_2);

        builder.create<memref::StoreOp>(loc, m_p_xm_2, res, i_2);
        builder.create<memref::StoreOp>(loc, p_xm_2_p1, res, i_2_p1);

        Value i4_next = builder.create<arith::AddIOp>(loc, i4, c4);
        Value i_next = builder.create<arith::AddIOp>(loc, i, c1);

        builder.create<scf::YieldOp>(loc, std::vector<Value>{i4_next, i_next});
      });

  return;
}

/// Emits IR that mirrors already-stored values of `res` inside the range
/// addressed by `quart = n >> 2`.
///
/// When `n % 8 == 0`, `res[quart]` and `res[quart + 1]` are both set to the
/// constant 0.7071067811865475... (sqrt(2) / 2). Afterwards, for
/// i = 2, 4, ... < quart and j = 2 * quart - 2, 2 * quart - 4, ...:
///   res[j] = res[i + 1];  res[j + 1] = res[i].
void fill_first_quadrant(OpBuilder &opBuilder, Location loc, Value n,
                         Value res) {

  Value zeroIdx = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value oneIdx = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value twoIdx = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value eightIdx = opBuilder.create<ConstantIndexOp>(loc, 8);

  FloatType elemTy = opBuilder.getF64Type();

  Value halfSqrt2 = opBuilder.create<ConstantFloatOp>(
      loc, APFloat(double(0.707106781186547524400844362104849)), elemTy);

  Value quarter = opBuilder.create<arith::ShRUIOp>(loc, n, twoIdx);
  Value nRem8 = opBuilder.create<arith::RemUIOp>(loc, n, eightIdx);

  Value isMultipleOf8 = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, nRem8, zeroIdx);

  // Middle entry exists only for multiples of 8.
  opBuilder.create<scf::IfOp>(
      loc, isMultipleOf8, [&](OpBuilder &thenBuilder, Location thenLoc) {
        Value quarterP1 =
            thenBuilder.create<arith::AddIOp>(thenLoc, quarter, oneIdx);
        thenBuilder.create<memref::StoreOp>(thenLoc, halfSqrt2, res, quarter);
        thenBuilder.create<memref::StoreOp>(thenLoc, halfSqrt2, res,
                                            quarterP1);
        thenBuilder.create<scf::YieldOp>(thenLoc, std::nullopt);
      });

  // Descending cursor starts at 2 * quart - 2.
  Value quarterX2 = opBuilder.create<arith::MulIOp>(loc, quarter, twoIdx);
  Value mirrorInit = opBuilder.create<arith::SubIOp>(loc, quarterX2, twoIdx);

  opBuilder.create<scf::ForOp>(
      loc, twoIdx, quarter, twoIdx, ValueRange{mirrorInit},
      [&](OpBuilder &body, Location bodyLoc, Value src, ValueRange carried) {
        Value dst = carried[0];

        Value srcP1 = body.create<arith::AddIOp>(bodyLoc, src, oneIdx);
        Value dstP1 = body.create<arith::AddIOp>(bodyLoc, dst, oneIdx);

        Value first = body.create<memref::LoadOp>(bodyLoc, res, src);
        Value second = body.create<memref::LoadOp>(bodyLoc, res, srcP1);

        // The pair is written back with its two components swapped.
        body.create<memref::StoreOp>(bodyLoc, second, res, dst);
        body.create<memref::StoreOp>(bodyLoc, first, res, dstP1);

        Value dstNext = body.create<arith::SubIOp>(bodyLoc, dst, twoIdx);
        body.create<scf::YieldOp>(bodyLoc, dstNext);
      });

  return;
}

/// Emits IR that derives further entries of `res` from the values already
/// stored in its leading part. With `half = n >> 1`:
///   - if `n % 4 == 0`, for i = 0, 2, ... < half:
///       res[i + half] = -res[i + 1];  res[i + half + 1] = res[i];
///   - otherwise, for i = 2, 4, ... < half and
///     j = 2 * half - 2, 2 * half - 4, ...:
///       res[j] = -res[i];  res[j + 1] = res[i + 1].
/// Negation is emitted as a multiplication by -1.0.
void fill_first_half(OpBuilder &opBuilder, Location loc, Value n, Value res) {
  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value c4 = opBuilder.create<ConstantIndexOp>(loc, 4);

  FloatType f64Ty = opBuilder.getF64Type();
  Value c_1 =
      opBuilder.create<ConstantFloatOp>(loc, APFloat(double(-1.0)), f64Ty);

  Value half = opBuilder.create<arith::ShRUIOp>(loc, n, c1);
  Value n_mod_4 = opBuilder.create<arith::RemUIOp>(loc, n, c4);

  Value condition = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, n_mod_4, c0);

  opBuilder.create<scf::IfOp>(
      loc, condition,
      [&](OpBuilder &builder, Location loc) {
        // n % 4 == 0: shift each pair by `half`, swapping and negating.
        builder.create<scf::ForOp>(
            loc, c0, half, c2, std::nullopt,
            [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) {
              Value i_plus_1 = b.create<arith::AddIOp>(loc, i, c1);
              Value i_plus_half = b.create<arith::AddIOp>(loc, i, half);
              Value i_plus_half_plus_1 =
                  b.create<arith::AddIOp>(loc, i_plus_half, c1);

              Value val_i = b.create<memref::LoadOp>(loc, res, i);
              Value val_i_plus_1 = b.create<memref::LoadOp>(loc, res, i_plus_1);

              Value neg_val_i_plus_1 =
                  b.create<arith::MulFOp>(loc, val_i_plus_1, c_1);
              b.create<memref::StoreOp>(loc, neg_val_i_plus_1, res,
                                        i_plus_half);
              b.create<memref::StoreOp>(loc, val_i, res, i_plus_half_plus_1);
              b.create<scf::YieldOp>(loc, std::nullopt);
            });
        builder.create<scf::YieldOp>(loc, std::nullopt);
      },
      [&](OpBuilder &builder, Location loc) {
        // Descending cursor starts at (half - 1) * 2 = 2 * half - 2.
        Value half_minus_1 = builder.create<arith::SubIOp>(loc, half, c1);
        Value two_half_minus_2 =
            builder.create<arith::MulIOp>(loc, half_minus_1, c2);

        builder.create<scf::ForOp>(
            loc, c2, half, c2, ValueRange{two_half_minus_2},
            [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) {
              Value j = i_args[0];
              // Every op of the loop body is created through the body's own
              // builder `b`, not through the enclosing region's builder.
              Value i_plus_1 = b.create<arith::AddIOp>(loc, i, c1);
              Value j_plus_1 = b.create<arith::AddIOp>(loc, j, c1);
              Value val_i = b.create<memref::LoadOp>(loc, res, i);
              Value val_i_plus_1 = b.create<memref::LoadOp>(loc, res, i_plus_1);
              Value neg_val_i = b.create<arith::MulFOp>(loc, val_i, c_1);
              b.create<memref::StoreOp>(loc, neg_val_i, res, j);
              b.create<memref::StoreOp>(loc, val_i_plus_1, res, j_plus_1);

              Value j_next = b.create<arith::SubIOp>(loc, j, c2);
              b.create<scf::YieldOp>(loc, j_next);
            });

        builder.create<scf::YieldOp>(loc, std::nullopt);
      });

  return;
}

/// Emits IR that populates the table `res` for length `n`, selecting the
/// generation strategy at run time from the divisibility of `n`:
///   - `n % 4 == 0`: `calc_first_octant` (with bias 0), then
///     `fill_first_quadrant`, then `fill_first_half`;
///   - otherwise, `n % 2 == 0`: `calc_first_quadrant`, then
///     `fill_first_half`;
///   - otherwise: `calc_first_half`.
void sincos_2pibyn_half(OpBuilder &opBuilder, Location loc, Value n,
                        Value res) {
  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value c4 = opBuilder.create<ConstantIndexOp>(loc, 4);

  Value n_mod_4 = opBuilder.create<arith::RemUIOp>(loc, n, c4);

  Value condition = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, n_mod_4, c0);

  opBuilder.create<scf::IfOp>(
      loc, condition,
      [&](OpBuilder &builder, Location loc) {
        calc_first_octant(builder, loc, n, res, c0);

        fill_first_quadrant(builder, loc, n, res);
        fill_first_half(builder, loc, n, res);
        builder.create<scf::YieldOp>(loc, std::nullopt);
      },
      [&](OpBuilder &builder, Location loc) {
        // The nested test must be on `n % 2`: this region only runs when
        // `n % 4 != 0`, so re-testing `condition` here would always be false
        // and the quadrant-based branch could never execute.
        Value n_mod_2 = builder.create<arith::RemUIOp>(loc, n, c2);
        Value condition1 = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::eq, n_mod_2, c0);

        builder.create<scf::IfOp>(
            loc, condition1,
            [&](OpBuilder &b, Location loc) {
              calc_first_quadrant(b, loc, n, res);
              fill_first_half(b, loc, n, res);
              b.create<scf::YieldOp>(loc, std::nullopt);
            },
            [&](OpBuilder &b, Location loc) {
              calc_first_half(b, loc, n, res);
              b.create<scf::YieldOp>(loc, std::nullopt);
            });
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });
}

/// Calculates the twiddle factors for the input length.
///
/// The emitted IR allocates a temporary f64 buffer `twid` of `2 * length`
/// elements, fills it via `sincos_2pibyn_half`, and then loops over the
/// factors `ip = Rfftp_fctdata_fct[k]` for k in [0, nfct), where `nfct` is
/// loaded from `Rfftp_plan_nfct[0]`. The loop carries `l1` (initially 1,
/// multiplied by `ip` after each factor). For each factor:
///   - if `k < nfct - 1`, with `ido = length / (l1 * ip)`, the buffer loaded
///     from `Rfftp_fctdata_tw[k]` receives, for j in [1, ip) and
///     i in [1, (ido - 1) / 2 + 1):
///       tw[(j-1)*(ido-1) + 2i - 2] = twid[2*j*l1*i]
///       tw[(j-1)*(ido-1) + 2i - 1] = twid[2*j*l1*i + 1]
///   - if `ip > 5`, the buffer loaded from `Rfftp_fctdata_tws[k]` receives
///     tws[0] = 1.0, tws[1] = 0.0 and, for i in [1, (ip >> 1) + 1) with
///     s = 2 * i * (length / ip):
///       tws[2i]        = twid[s]         tws[2i + 1]        = twid[s + 1]
///       tws[2(ip - i)] = 0.0 + twid[s]   tws[2(ip - i) + 1] = 0.0 - twid[s + 1]
/// `twid` is deallocated at the end.
///
/// `Rfftp_plan_length` and `Rfftp_plan_mem` are accepted but not read here.
/// Returns the index constant 0.
Value rfftp_comp_twiddle(OpBuilder &opBuilder, Location loc, Value length,
                         Value Rfftp_fctdata_fct, Value Rfftp_fctdata_tw,
                         Value Rfftp_fctdata_tws, Value Rfftp_plan_length,
                         Value Rfftp_plan_nfct, Value Rfftp_plan_mem) {
  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  // TODO: remove the commented-out constants below? (c1, c2 and c5 are used.)
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  // Value c3 = opBuilder.create<ConstantIndexOp>(loc, 3);
  // Value c4 = opBuilder.create<ConstantIndexOp>(loc, 4);
  Value c5 = opBuilder.create<ConstantIndexOp>(loc, 5);
  // Value c50 = opBuilder.create<ConstantIndexOp>(loc, 50);

  Value length_2 = opBuilder.create<arith::MulIOp>(loc, length, c2);
  FloatType f64Ty = opBuilder.getF64Type();

  // Temporary table of 2 * length f64 values; freed before returning.
  Value twid = opBuilder.create<memref::AllocOp>(
      loc, MemRefType::get(ShapedType::kDynamic, f64Ty),
      /*dynamicOperands=*/length_2);

  Value plan_nfct = opBuilder.create<memref::LoadOp>(loc, Rfftp_plan_nfct, c0);

  sincos_2pibyn_half(opBuilder, loc, length, twid);

  Value l1_start = opBuilder.create<ConstantIndexOp>(loc, 1);

  // Loop over the factors; `l1` is carried across iterations.
  opBuilder.create<scf::ForOp>(
      loc, c0, plan_nfct, c1, ValueRange{l1_start},
      [&](OpBuilder &builder, Location loc, Value k, ValueRange k_args) {
        Value l1 = k_args[0];

        Value ip = builder.create<memref::LoadOp>(loc, Rfftp_fctdata_fct, k);

        // ido = length / (l1 * ip)
        Value l1_m_ip = builder.create<arith::MulIOp>(loc, l1, ip);
        Value ido = builder.create<arith::DivSIOp>(loc, length, l1_m_ip);
        Value plan_nfct_m1 = builder.create<arith::SubIOp>(loc, plan_nfct, c1);

        // The per-factor `tw` buffer is filled for every factor but the last.
        Value condition1 = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::slt, k, plan_nfct_m1);

        builder.create<scf::IfOp>(
            loc, condition1, [&](OpBuilder &b, Location loc) {
              // Exclusive upper bound of the i loop: (ido - 1) / 2 + 1.
              Value ido_m1 = b.create<arith::SubIOp>(loc, ido, c1);
              Value ido_m1_d2 = b.create<arith::DivSIOp>(loc, ido_m1, c2);
              Value ido_m1_d2_p1 = b.create<arith::AddIOp>(loc, ido_m1_d2, c1);

              b.create<scf::ForOp>(
                  loc, c1, ip, c1, std::nullopt,
                  [&](OpBuilder &b2, Location loc, Value j, ValueRange j_args) {
                    b2.create<scf::ForOp>(
                        loc, c1, ido_m1_d2_p1, c1, std::nullopt,
                        [&](OpBuilder &b3, Location loc, Value i,
                            ValueRange i_args) {
                          // Source indices: 2*j*l1*i and 2*j*l1*i + 1.
                          Value j2 = b3.create<arith::MulIOp>(loc, j, c2);
                          Value j2_l1 = b3.create<arith::MulIOp>(loc, j2, l1);
                          Value j2_l1_i =
                              b3.create<arith::MulIOp>(loc, j2_l1, i);
                          Value j2_l1_i_p1 =
                              b3.create<arith::AddIOp>(loc, j2_l1_i, c1);

                          // Destination base: (ido - 1) * (j - 1).
                          Value j_m1 = b3.create<arith::SubIOp>(loc, j, c1);
                          Value ido_m1_j_m1 =
                              b3.create<arith::MulIOp>(loc, ido_m1, j_m1);

                          Value i2 = b3.create<arith::MulIOp>(loc, i, c2);
                          Value i2_m1 = b3.create<arith::SubIOp>(loc, i2, c1);
                          Value i2_m2 = b3.create<arith::SubIOp>(loc, i2, c2);

                          Value tw_a =
                              b3.create<arith::AddIOp>(loc, ido_m1_j_m1, i2_m2);
                          Value tw_b =
                              b3.create<arith::AddIOp>(loc, ido_m1_j_m1, i2_m1);

                          Value twid_a =
                              b3.create<memref::LoadOp>(loc, twid, j2_l1_i);
                          Value twid_b =
                              b3.create<memref::LoadOp>(loc, twid, j2_l1_i_p1);

                          // Destination buffer for factor k.
                          Value fct_k = b3.create<memref::LoadOp>(
                              loc, Rfftp_fctdata_tw, k);

                          b3.create<memref::StoreOp>(loc, twid_a, fct_k, tw_a);
                          b3.create<memref::StoreOp>(loc, twid_b, fct_k, tw_b);

                          b3.create<scf::YieldOp>(loc, std::nullopt);
                        });
                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  });

              b.create<scf::YieldOp>(loc, std::nullopt);
            });

        // The per-factor `tws` buffer is filled only when ip > 5.
        Value condition2 = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::sgt, ip, c5);

        builder.create<scf::IfOp>(
            loc, condition2, [&](OpBuilder &b, Location loc) {
              Value fct_k = b.create<memref::LoadOp>(loc, Rfftp_fctdata_tws, k);
              Value c_f0 =
                  b.create<ConstantFloatOp>(loc, APFloat(double(0.0)), f64Ty);
              Value c_f1 =
                  b.create<ConstantFloatOp>(loc, APFloat(double(1.0)), f64Ty);

              // tws[0] = 1.0, tws[1] = 0.0
              b.create<memref::StoreOp>(loc, c_f1, fct_k, c0);
              b.create<memref::StoreOp>(loc, c_f0, fct_k, c1);

              // Exclusive upper bound of the i loop: (ip >> 1) + 1.
              Value ip_div_2 = b.create<arith::ShRSIOp>(loc, ip, c1);
              Value ip_div_2_p1 = b.create<arith::AddIOp>(loc, ip_div_2, c1);

              b.create<scf::ForOp>(
                  loc, c1, ip_div_2_p1, c1, std::nullopt,
                  [&](OpBuilder &b2, Location loc, Value i, ValueRange i_args) {
                    // Destination indices 2i, 2i + 1, 2(ip - i), 2(ip - i) + 1.
                    Value i2 = b2.create<arith::MulIOp>(loc, i, c2);
                    Value i2_p1 = b2.create<arith::AddIOp>(loc, i2, c1);
                    Value ip_m_i = b2.create<arith::SubIOp>(loc, ip, i);
                    Value ip_m_i_2 = b2.create<arith::MulIOp>(loc, ip_m_i, c2);
                    Value ip_m_i_2_p1 =
                        b2.create<arith::AddIOp>(loc, ip_m_i_2, c1);

                    // Source indices 2i * (length / ip) and the one after it.
                    Value length_div_ip =
                        b2.create<arith::DivSIOp>(loc, length, ip);
                    Value i2_length_div_ip =
                        b2.create<arith::MulIOp>(loc, i2, length_div_ip);
                    Value i2_length_div_ip_p1 =
                        b2.create<arith::AddIOp>(loc, i2_length_div_ip, c1);

                    Value twid_a =
                        b2.create<memref::LoadOp>(loc, twid, i2_length_div_ip);
                    Value twid_b = b2.create<memref::LoadOp>(
                        loc, twid, i2_length_div_ip_p1);
                    // twid_c = 0.0 + twid_a; twid_d = 0.0 - twid_b.
                    Value twid_c = b2.create<arith::AddFOp>(loc, c_f0, twid_a);
                    Value twid_d = b2.create<arith::SubFOp>(loc, c_f0, twid_b);

                    b2.create<memref::StoreOp>(loc, twid_a, fct_k, i2);
                    b2.create<memref::StoreOp>(loc, twid_b, fct_k, i2_p1);
                    b2.create<memref::StoreOp>(loc, twid_c, fct_k, ip_m_i_2);
                    b2.create<memref::StoreOp>(loc, twid_d, fct_k, ip_m_i_2_p1);
                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  });

              b.create<scf::YieldOp>(loc, std::nullopt);
            });

        // l1 *= ip for the next factor.
        Value l1_next = builder.create<arith::MulIOp>(loc, l1, ip);
        builder.create<scf::YieldOp>(loc, l1_next);
      });

  opBuilder.create<memref::DeallocOp>(loc, twid);

  return c0;
}

/// Emits IR that allocates and initializes the storage of a real-FFT plan for
/// `length`, then computes its factors and twiddle factors.
///
/// Allocated buffers:
///   - a 25-entry index table of factors,
///   - two 25-entry tables (`tw`, `tws`) whose elements are dynamically sized
///     f64 buffers, each slot being given a fresh buffer of `2 * length`
///     elements,
///   - single-element index buffers for the plan length and factor count,
///   - a single-element f64 buffer for the plan memory slot.
/// The length buffer is set to `length`, the factor count and every factor
/// slot to 0. When `length != 1`, `rfftp_factorize` and `rfftp_comp_twiddle`
/// are emitted inside an `scf.if`.
///
/// Returns {fct, tw, tws, plan_length, plan_nfct, plan_mem}, in that order.
std::vector<Value> make_rfftp_plan(OpBuilder &opBuilder, Location loc,
                                   Value length) {

  Value zeroIdx = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value oneIdx = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value twoIdx = opBuilder.create<ConstantIndexOp>(loc, 2);

  // Number of factor slots reserved in the plan.
  int64_t maxFactors = 25;
  Value maxFactorsIdx = opBuilder.create<ConstantIndexOp>(loc, maxFactors);

  FloatType elemTy = opBuilder.getF64Type();
  IndexType idxTy = opBuilder.getIndexType();

  Value doubledLength = opBuilder.create<arith::MulIOp>(loc, length, twoIdx);

  MemRefType factorTableTy = MemRefType::get(maxFactors, idxTy);
  MemRefType twiddleBufTy = MemRefType::get(ShapedType::kDynamic, elemTy);
  MemRefType twiddleTableTy = MemRefType::get(maxFactors, twiddleBufTy);
  MemRefType idxCellTy = MemRefType::get(1, idxTy);
  MemRefType f64CellTy = MemRefType::get(1, elemTy);

  Value factorTable = opBuilder.create<memref::AllocOp>(loc, factorTableTy);
  Value twTable = opBuilder.create<memref::AllocOp>(loc, twiddleTableTy);
  Value twsTable = opBuilder.create<memref::AllocOp>(loc, twiddleTableTy);
  Value planLength = opBuilder.create<memref::AllocOp>(loc, idxCellTy);
  Value planNfct = opBuilder.create<memref::AllocOp>(loc, idxCellTy);
  Value planMem = opBuilder.create<memref::AllocOp>(loc, f64CellTy);

  // plan.length = length, plan.nfct = 0
  opBuilder.create<memref::StoreOp>(loc, length, planLength, zeroIdx);
  opBuilder.create<memref::StoreOp>(loc, zeroIdx, planNfct, zeroIdx);

  // Reset every factor slot and give it its own tw / tws buffers.
  opBuilder.create<scf::ForOp>(
      loc, zeroIdx, maxFactorsIdx, oneIdx, std::nullopt,
      [&](OpBuilder &body, Location bodyLoc, Value slot,
          ValueRange /*carried*/) {
        body.create<memref::StoreOp>(bodyLoc, zeroIdx, factorTable, slot);

        Value twBuf = body.create<memref::AllocOp>(
            bodyLoc, twiddleBufTy, /*dynamicOperands=*/doubledLength);
        body.create<memref::StoreOp>(bodyLoc, twBuf, twTable, slot);
        Value twsBuf = body.create<memref::AllocOp>(
            bodyLoc, twiddleBufTy, /*dynamicOperands=*/doubledLength);
        body.create<memref::StoreOp>(bodyLoc, twsBuf, twsTable, slot);

        body.create<scf::YieldOp>(bodyLoc, std::nullopt);
      });

  // A plan of length 1 needs neither factors nor twiddles.
  Value needsFactors = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::ne, length, oneIdx);

  opBuilder.create<scf::IfOp>(
      loc, needsFactors, [&](OpBuilder &thenBuilder, Location thenLoc) {
        rfftp_factorize(thenBuilder, thenLoc, factorTable, twTable, twsTable,
                        planLength, planNfct, planMem);
        rfftp_comp_twiddle(thenBuilder, thenLoc, length, factorTable, twTable,
                           twsTable, planLength, planNfct, planMem);
        thenBuilder.create<scf::YieldOp>(thenLoc, std::nullopt);
      });

  return {factorTable, twTable, twsTable, planLength, planNfct, planMem};
}

// Emits a loop that exchanges the contents of `p` and `p1` element by element.
//
// The trip count is dimension 0 of `p`; `p1` is loaded and stored at the same
// indices. Only the data is exchanged: both values keep referring to the same
// buffers after the call.
void memref_SWAP(OpBuilder &opBuilder, Location loc, Value p, Value p1) {
  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);

  Value length = opBuilder.create<memref::DimOp>(loc, p, c0);

  opBuilder.create<scf::ForOp>(
      loc, c0, length, c1, std::nullopt,
      // The body builder is taken by reference (not copied), like the other
      // region callbacks in this file.
      [&](OpBuilder &builder, Location loc, Value i, ValueRange /*iterArgs*/) {
        // Both elements are read before either one is overwritten.
        Value val_p = builder.create<memref::LoadOp>(loc, p, i);
        Value val_p1 = builder.create<memref::LoadOp>(loc, p1, i);

        builder.create<memref::StoreOp>(loc, val_p, p1, i);
        builder.create<memref::StoreOp>(loc, val_p1, p, i);
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });
}

// Emits IR that toggles the index value stored at `flag[0]`: the slot is
// rewritten to 1 when it currently holds 0, and to 0 for any other value.
void flag_SWAP(OpBuilder &opBuilder, Location loc, Value flag) {
  Value zero = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value one = opBuilder.create<ConstantIndexOp>(loc, 1);

  Value current = opBuilder.create<memref::LoadOp>(loc, flag, zero);
  Value isZero = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, current, zero);

  // select(isZero, 1, 0) yields the toggled value.
  Value toggled = opBuilder.create<arith::SelectOp>(loc, isZero, one, zero);
  opBuilder.create<memref::StoreOp>(loc, toggled, flag, zero);
}

// Emits the final copy/normalisation step of the real FFT.
//
//   c    - destination memref, written at indices [0, n).
//   p1   - source memref, read only when `flag[0] == 0`.
//   n    - number of elements to process (index value).
//   fct  - scale factor; compared against the f64 constant 1.0.
//   flag - memref whose element 0 selects the mode.
//
// Generated behaviour:
//   * flag[0] == 0: c[i] = fct * p1[i] when fct != 1.0, else c[i] = p1[i].
//   * otherwise   : c[i] = fct * c[i] when fct != 1.0, else nothing is done.
void copy_and_norm(OpBuilder &opBuilder, Location loc, Value c, Value p1,
                   Value n, Value fct, Value flag) {
  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  FloatType f64Ty = opBuilder.getF64Type();
  Value f1 =
      opBuilder.create<ConstantFloatOp>(loc, APFloat(double(1.0)), f64Ty);

  Value flag_val = opBuilder.create<memref::LoadOp>(loc, flag, c0);
  Value condition = opBuilder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, flag_val, c0);

  // Emits `for i in [0, n): c[i] = scale ? fct * src[i] : src[i]`.
  // Shared by all three loops below; the builder is always taken by reference.
  auto emitLoop = [&](OpBuilder &b, Location loc, Value src, bool scale) {
    b.create<scf::ForOp>(
        loc, c0, n, c1, std::nullopt,
        [&](OpBuilder &body, Location loc, Value i, ValueRange /*iterArgs*/) {
          Value elem = body.create<memref::LoadOp>(loc, src, i);
          if (scale)
            elem = body.create<arith::MulFOp>(loc, fct, elem);
          body.create<memref::StoreOp>(loc, elem, c, i);
          body.create<scf::YieldOp>(loc, std::nullopt);
        });
  };

  opBuilder.create<scf::IfOp>(
      loc, condition,
      [&](OpBuilder &builder, Location loc) {
        // flag[0] == 0: copy from `p1` into `c`, scaling only if fct != 1.0.
        Value needsScale = builder.create<arith::CmpFOp>(
            loc, arith::CmpFPredicate::ONE, fct, f1);
        builder.create<scf::IfOp>(
            loc, needsScale,
            [&](OpBuilder &b, Location loc) {
              emitLoop(b, loc, p1, /*scale=*/true);
              b.create<scf::YieldOp>(loc, std::nullopt);
            },
            [&](OpBuilder &b, Location loc) {
              emitLoop(b, loc, p1, /*scale=*/false);
              b.create<scf::YieldOp>(loc, std::nullopt);
            });
        builder.create<scf::YieldOp>(loc, std::nullopt);
      },
      [&](OpBuilder &builder, Location loc) {
        // flag[0] != 0: scale `c` in place, and only if fct != 1.0.
        Value needsScale = builder.create<arith::CmpFOp>(
            loc, arith::CmpFPredicate::ONE, fct, f1);
        builder.create<scf::IfOp>(
            loc, needsScale, [&](OpBuilder &b, Location loc) {
              emitLoop(b, loc, c, /*scale=*/true);
              b.create<scf::YieldOp>(loc, std::nullopt);
            });
        builder.create<scf::YieldOp>(loc, std::nullopt);
      });
}

// FFT forward function for real number.
//
// Emits IR that runs the forward real FFT in place on `c`, driven by the plan
// memrefs:
//   Rfftp_fctdata_fct - read at index k to obtain the k-th factor `ip`.
//   Rfftp_fctdata_tw  - read at index k to obtain the twiddle memref `tw`.
//   Rfftp_fctdata_tws - read at index k only on the generic-radix path.
//   Rfftp_plan_length - element 0 holds the transform length `n`.
//   Rfftp_plan_nfct   - element 0 holds the number of factors `nf`.
//   Rfftp_plan_mem    - not used by this function.
//   c                 - data memref (must be a MemRefType with a dynamic
//                       dimension 0, since the scratch buffer is allocated
//                       with `c`'s type and one dynamic size operand).
//   fct               - scale factor forwarded to copy_and_norm.
//
// Nothing is emitted inside the guard when n == 1. Otherwise the factors are
// visited from the last to the first; factors 4, 2, 3 and 5 go to radf4,
// radf2, radf3 and radf5, any other factor goes to radfg. After every pass the
// contents of the two working views are exchanged and `flag[0]` is toggled
// (twice on the radfg path), and finally copy_and_norm is applied.
//
// NOTE(review): the scratch buffer `ch` and the one-element `flag` buffer are
// allocated here and no matching dealloc is emitted in this function —
// confirm that a later pass releases them.
void rfftp_forward(OpBuilder &opBuilder, Location loc, Value Rfftp_fctdata_fct,
                   Value Rfftp_fctdata_tw, Value Rfftp_fctdata_tws,
                   Value Rfftp_plan_length, Value Rfftp_plan_nfct,
                   Value Rfftp_plan_mem, Value c, Value fct) {

  Value c0 = opBuilder.create<ConstantIndexOp>(loc, 0);
  Value c1 = opBuilder.create<ConstantIndexOp>(loc, 1);
  Value c2 = opBuilder.create<ConstantIndexOp>(loc, 2);
  Value c3 = opBuilder.create<ConstantIndexOp>(loc, 3);
  Value c4 = opBuilder.create<ConstantIndexOp>(loc, 4);
  Value c5 = opBuilder.create<ConstantIndexOp>(loc, 5);
  FloatType f64Ty = opBuilder.getF64Type();

  Value n = opBuilder.create<memref::LoadOp>(loc, Rfftp_plan_length, c0);

  Value condition =
      opBuilder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne, n, c1);

  opBuilder.create<scf::IfOp>(
      loc, condition, [&](OpBuilder &builder, Location loc) {
        // One-element index buffer toggled after every buffer exchange.
        Value flag = builder.create<memref::AllocOp>(
            loc, MemRefType::get(1, builder.getIndexType()));
        builder.create<memref::StoreOp>(loc, c1, flag, c0);
        Value l1_raw = builder.create<arith::AddIOp>(loc, n, c0);
        Value nf = builder.create<memref::LoadOp>(loc, Rfftp_plan_nfct, c0);

        // `c` is required to be a memref; cast<> asserts that instead of
        // silently producing a null type that would be used below.
        MemRefType cType = cast<MemRefType>(c.getType());
        Value dimSize = builder.create<memref::DimOp>(loc, c, 0);
        // Scratch buffer with the same type and dynamic size as `c`.
        Value ch = builder.create<memref::AllocOp>(loc, cType,
                                                   /*dynamicOperands=*/dimSize);

        // View type: memref<?xf64, strided<[?], offset: ?>>
        StridedLayoutAttr layout = StridedLayoutAttr::get(
            builder.getContext(),
            /*offset=*/ShapedType::kDynamic,
            /*strides=*/{ShapedType::kDynamic});
        MemRefType resultType =
            MemRefType::get(ShapedType::kDynamic, f64Ty, layout);

        // Length-n views over the data buffer and the scratch buffer.
        Value p1_raw = builder.create<memref::SubViewOp>(
            loc, resultType, c, SmallVector<OpFoldResult>{c0},
            SmallVector<OpFoldResult>{n}, SmallVector<OpFoldResult>{c1});

        Value p2_raw = builder.create<memref::SubViewOp>(
            loc, resultType, ch, SmallVector<OpFoldResult>{c0},
            SmallVector<OpFoldResult>{n}, SmallVector<OpFoldResult>{c1});

        // The loop carries `l1`, starting at n and divided by each factor.
        builder.create<scf::ForOp>(
            loc, c0, nf, c1, ValueRange{l1_raw},
            [&](OpBuilder &b, Location loc, Value k1, ValueRange k1_args) {
              Value l1_old = k1_args[0];

              // k = nf - k1 - 1: factors are consumed back to front.
              Value nf_m_k1 = b.create<arith::SubIOp>(loc, nf, k1);
              Value k = b.create<arith::SubIOp>(loc, nf_m_k1, c1);
              Value ip = b.create<memref::LoadOp>(loc, Rfftp_fctdata_fct, k);
              Value ido = b.create<arith::DivSIOp>(loc, n, l1_old);
              Value l1 = b.create<arith::DivSIOp>(loc, l1_old, ip);

              Value tw = b.create<memref::LoadOp>(loc, Rfftp_fctdata_tw, k);

              // Radix dispatch: 4, then 2, then 3, then 5, else generic.
              Value isRadix4 = b.create<arith::CmpIOp>(
                  loc, arith::CmpIPredicate::eq, ip, c4);

              b.create<scf::IfOp>(
                  loc, isRadix4,
                  [&](OpBuilder &b2, Location loc) {
                    radf4(b2, loc, p1_raw, p2_raw, tw, ido, l1, c0, c1, c2, c3);
                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  },
                  [&](OpBuilder &b2, Location loc) {
                    Value isRadix2 = b2.create<arith::CmpIOp>(
                        loc, arith::CmpIPredicate::eq, ip, c2);
                    b2.create<scf::IfOp>(
                        loc, isRadix2,
                        [&](OpBuilder &b3, Location loc) {
                          radf2(b3, loc, p1_raw, p2_raw, tw, ido, l1);
                          b3.create<scf::YieldOp>(loc, std::nullopt);
                        },
                        [&](OpBuilder &b3, Location loc) {
                          Value isRadix3 = b3.create<arith::CmpIOp>(
                              loc, arith::CmpIPredicate::eq, ip, c3);
                          b3.create<scf::IfOp>(
                              loc, isRadix3,
                              [&](OpBuilder &b4, Location loc) {
                                radf3(b4, loc, p1_raw, p2_raw, tw, ido, l1);
                                b4.create<scf::YieldOp>(loc, std::nullopt);
                              },
                              [&](OpBuilder &b4, Location loc) {
                                Value isRadix5 = b4.create<arith::CmpIOp>(
                                    loc, arith::CmpIPredicate::eq, ip, c5);
                                b4.create<scf::IfOp>(
                                    loc, isRadix5,
                                    [&](OpBuilder &b5, Location loc) {
                                      radf5(b5, loc, p1_raw, p2_raw, tw, ido,
                                            l1, c0, c1, c2, c3, c4);
                                      b5.create<scf::YieldOp>(loc,
                                                              std::nullopt);
                                    },
                                    [&](OpBuilder &b5, Location loc) {
                                      // Generic radix: needs the second
                                      // twiddle table and an extra exchange.
                                      Value tws = b5.create<memref::LoadOp>(
                                          loc, Rfftp_fctdata_tws, k);
                                      radfg(b5, loc, p1_raw, p2_raw, tw, tws,
                                            ido, ip, l1);
                                      memref_SWAP(b5, loc, p1_raw, p2_raw);
                                      flag_SWAP(b5, loc, flag);
                                      b5.create<scf::YieldOp>(loc,
                                                              std::nullopt);
                                    });
                                b4.create<scf::YieldOp>(loc, std::nullopt);
                              });
                          b3.create<scf::YieldOp>(loc, std::nullopt);
                        });
                    b2.create<scf::YieldOp>(loc, std::nullopt);
                  });

              // Exchange the working buffers after every pass.
              memref_SWAP(b, loc, p1_raw, p2_raw);
              flag_SWAP(b, loc, flag);

              b.create<scf::YieldOp>(loc, l1);
            });

        copy_and_norm(builder, loc, c, p1_raw, n, fct, flag);

        builder.create<scf::YieldOp>(loc, std::nullopt);
      });
}

// Calculate abspower of bufferMem and store result to a specific line in the
// resultMem.
//
//   bufferMem - 1-D memref read at indices 0..399.
//   resultMem - 2-D memref; row `idx`, columns 0..200 are written.
//   idx       - row index in `resultMem`.
//   c0/c1/c2  - index constants 0, 1 and 2 supplied by the caller.
//
// Layout of the generated stores:
//   result[idx][0]   = buffer[0]^2
//   result[idx][200] = buffer[399]^2
//   result[idx][j]   = buffer[2j-1]^2 + buffer[2j]^2   for j = 1..199
//
// Squares are emitted as arith.mulf(x, x): the same value as raising to the
// integer power 2, without a math.fpowi op and its index-typed exponent
// constant.
void absPower(OpBuilder &builder, Location loc, Value bufferMem,
              Value resultMem, Value idx, Value c0, Value c1, Value c2) {
  Value c200 = builder.create<ConstantIndexOp>(loc, 200);
  Value c398 = builder.create<ConstantIndexOp>(loc, 398);
  Value c399 = builder.create<ConstantIndexOp>(loc, 399);

  // Emits x * x with the given builder.
  auto square = [](OpBuilder &b, Location loc, Value x) -> Value {
    return b.create<arith::MulFOp>(loc, x, x);
  };

  Value firstNum = builder.create<memref::LoadOp>(loc, bufferMem, c0);
  Value firstPow = square(builder, loc, firstNum);
  builder.create<memref::StoreOp>(loc, firstPow, resultMem,
                                  ValueRange{idx, c0});

  Value lastNum = builder.create<memref::LoadOp>(loc, bufferMem, c399);
  Value lastPow = square(builder, loc, lastNum);
  builder.create<memref::StoreOp>(loc, lastPow, resultMem,
                                  ValueRange{idx, c200});

  // iv walks 1, 3, ..., 397 over the buffer; the loop-carried value is the
  // destination column, starting at 1 and advancing by 1 per iteration.
  builder.create<scf::ForOp>(
      loc, c1, c398, c2, ValueRange{c1},
      [&](OpBuilder &b, Location loc, Value iv, ValueRange iargs) {
        Value j = b.create<arith::AddIOp>(loc, iv, c1);
        Value num1 = b.create<memref::LoadOp>(loc, bufferMem, iv);
        Value num2 = b.create<memref::LoadOp>(loc, bufferMem, j);
        Value pow1 = square(b, loc, num1);
        Value pow2 = square(b, loc, num2);
        Value add = b.create<arith::AddFOp>(loc, pow1, pow2);
        b.create<memref::StoreOp>(loc, add, resultMem,
                                  ValueRange{idx, iargs[0]});

        Value indexNext = b.create<arith::AddIOp>(loc, iargs[0], c1);

        b.create<scf::YieldOp>(loc, indexNext);
      });
}

// Compute Log Mel Spectrogram.
//
//   input      - 1-D f64 tensor; frames of 400 samples are sliced from it at
//                offsets 0, 160, 320, ... (3001 frames).
//   window     - tensor multiplied element-wise with every frame.
//   melFilters - 2-D tensor, transposed into an 80x201 tensor before use.
//   c0, c1, c2 - index constants 0, 1 and 2 supplied by the caller.
//   f0, c3, c4, c5 - accepted for interface compatibility; not used here.
//
// Steps emitted:
//   1. For every frame: window it, run 'dap.rfft' in place on its buffer, and
//      store the power values into row `iv` of a 3001x201 f64 buffer.
//   2. Transpose the mel filters and the power buffer, then multiply them
//      (80x201 * 201x3001) into a zero-initialised 80x3001 accumulator.
//   3. Clamp the product from below with 1e-10 and take log10 element-wise.
//
// Returns the resulting 80x3001 f64 tensor.
Value spectrogram(PatternRewriter &rewriter, Location loc, Value f0, Value c0,
                  Value c1, Value c2, Value c3, Value c4, Value c5, Value input,
                  Value window, Value melFilters) {
  FloatType f64Ty = rewriter.getF64Type();

  Value numFrames = rewriter.create<ConstantIndexOp>(loc, 3001);
  Value hopLength = rewriter.create<ConstantIndexOp>(loc, 160);
  Value c400 = rewriter.create<ConstantIndexOp>(loc, 400);

  MemRefType spectrogramTy = MemRefType::get({3001, 201}, f64Ty);
  Value spectrogram = rewriter.create<memref::AllocOp>(loc, spectrogramTy);

  RankedTensorType tensorTy0 = RankedTensorType::get({400}, f64Ty);
  MemRefType frameMemTy = MemRefType::get({400}, f64Ty);
  MemRefType dynMemTy = MemRefType::get(ShapedType::kDynamic, f64Ty);

  // #mulf_trait for 'linalg.generic' operation.
  AffineMap mulFIdMap =
      AffineMap::getMultiDimIdentityMap(1, rewriter.getContext());
  SmallVector<AffineMap> mulFIndexingMaps = {mulFIdMap, mulFIdMap, mulFIdMap};
  SmallVector<utils::IteratorType> mulFIteratorTypes = {
      utils::IteratorType::parallel};

  // The loop-carried value is the start offset of the current frame in
  // `input`; it advances by `hopLength` per iteration. All ops in the body are
  // created through the callback's `builder`.
  rewriter.create<scf::ForOp>(
      loc, c0, numFrames, c1, ValueRange{c0},
      [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
        auto extractSliceOp = builder.create<tensor::ExtractSliceOp>(
            loc, input, iargs[0], c400, c1);
        Value buffer400 = extractSliceOp.getResult();
        Value buffer =
            builder.create<tensor::CastOp>(loc, tensorTy0, buffer400);

        // 'linalg.generic' operation use #mulf_trait.
        auto mulfOp = builder.create<linalg::GenericOp>(
            loc, /*resultTensorTypes=*/tensorTy0,
            /*inputs=*/ValueRange{buffer, window},
            /*outputs=*/ValueRange{buffer}, mulFIndexingMaps, mulFIteratorTypes,
            [&](OpBuilder &b, Location loc, ValueRange args) {
              Value elem = b.create<arith::MulFOp>(loc, args[0], args[1]);
              b.create<linalg::YieldOp>(loc, elem);
            });
        Value multiplied = mulfOp.getResult(0);

        Value bufferMem_raw = builder.create<bufferization::ToMemrefOp>(
            loc, frameMemTy, multiplied);

        // 'dap.rfft' is given a dynamically-shaped view of the frame buffer.
        Value bufferMem_rfft =
            builder.create<memref::CastOp>(loc, dynMemTy, bufferMem_raw);

        // Compute 'dap.rfft' operation, result stores in `bufferMem`.
        builder.create<dap::RFFTOp>(loc, bufferMem_rfft);

        Value bufferMem =
            builder.create<memref::CastOp>(loc, frameMemTy, bufferMem_rfft);

        // Store the result in a single line specified by `iv`.
        absPower(builder, loc, bufferMem, spectrogram, iv, c0, c1, c2);

        Value timestepNext =
            builder.create<arith::AddIOp>(loc, iargs[0], hopLength);

        builder.create<scf::YieldOp>(loc, timestepNext);
      });

  // TODO: check alloc and dealloc
  Value init0 =
      rewriter.create<tensor::EmptyOp>(loc, ArrayRef<int64_t>{80, 201}, f64Ty);
  auto transposeOp0 = rewriter.create<linalg::TransposeOp>(
      loc, /*input=*/melFilters,
      /*init=*/init0,
      /*permutation=*/ArrayRef<int64_t>{1, 0});
  Value melFiltersT = transposeOp0.getResult()[0];

  Value gram = rewriter.create<bufferization::ToTensorOp>(
      loc, spectrogram, /*restrict=*/true, /*writable=*/false);
  Value init1 = rewriter.create<tensor::EmptyOp>(
      loc, ArrayRef<int64_t>{201, 3001}, f64Ty);
  auto transposeOp1 = rewriter.create<linalg::TransposeOp>(
      loc, /*input=*/gram,
      /*init=*/init1,
      /*permutation=*/ArrayRef<int64_t>{1, 0});
  Value spectrogramT = transposeOp1.getResult()[0];

  rewriter.create<memref::DeallocOp>(loc, spectrogram);

  // 'linalg.matmul' accumulates into its output operand, so the accumulator
  // must start from zero rather than from the undefined contents of
  // 'tensor.empty'.
  Value init2 =
      rewriter.create<tensor::EmptyOp>(loc, ArrayRef<int64_t>{80, 3001}, f64Ty);
  Value matmulZero =
      rewriter.create<ConstantFloatOp>(loc, APFloat(double(0.0)), f64Ty);
  auto fillOp = rewriter.create<linalg::FillOp>(
      loc, /*inputs=*/ValueRange{matmulZero}, /*outputs=*/ValueRange{init2});
  Value matmulAcc = fillOp.getResult(0);
  auto matmulOp = rewriter.create<linalg::MatmulOp>(
      loc, /*inputs=*/ValueRange{melFiltersT, spectrogramT},
      /*outputs=*/ValueRange{matmulAcc});
  Value matMulResult = matmulOp.getResultTensors()[0];

  // Initialize a tensor with constant `1e-10`.
  RankedTensorType tensorTy1 = RankedTensorType::get({80, 3001}, f64Ty);
  Value cMelFloor = rewriter.create<ConstantFloatOp>(
      loc, APFloat(double(0.0000000001)), f64Ty);
  Value melFloor = rewriter.create<tensor::SplatOp>(loc, tensorTy1, cMelFloor);

  auto linalgMaxOp = rewriter.create<linalg::MaxOp>(
      loc, /*input=*/ValueRange{melFloor, matMulResult},
      /*outputs=*/ValueRange{melFloor});
  Value spectrogramMax = linalgMaxOp.getResultTensors()[0];

  // #log10_trait for 'linalg.generic' operation.
  AffineMap log10IdMap =
      AffineMap::getMultiDimIdentityMap(2, rewriter.getContext());
  SmallVector<AffineMap> log10IndexingMaps = {log10IdMap, log10IdMap};
  SmallVector<utils::IteratorType> log10IteratorTypes = {
      utils::IteratorType::parallel, utils::IteratorType::parallel};

  // 'linalg.generic' operation use #log10_trait.
  auto log10Op = rewriter.create<linalg::GenericOp>(
      loc, /*resultTensorTypes=*/tensorTy1,
      /*inputs=*/ValueRange{spectrogramMax},
      /*outputs=*/ValueRange{spectrogramMax}, log10IndexingMaps,
      log10IteratorTypes, [&](OpBuilder &b, Location loc, ValueRange args) {
        Value elem = b.create<math::Log10Op>(loc, args[0]);
        b.create<linalg::YieldOp>(loc, elem);
      });
  Value spectrogramLog10 = log10Op.getResult(0);

  return spectrogramLog10;
}

namespace {
// Rewrite pattern that expands 'dap.rfft' into the FFT plan construction and
// the forward transform emitted by make_rfftp_plan / rfftp_forward. The
// transform works on the op's single memref operand; the op is erased
// afterwards.
class DAPRFFTLowering : public OpRewritePattern<dap::RFFTOp> {
public:
  using OpRewritePattern<dap::RFFTOp>::OpRewritePattern;

  explicit DAPRFFTLowering(MLIRContext *context) : OpRewritePattern(context) {}

  LogicalResult matchAndRewrite(dap::RFFTOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value bufferMem = op->getOperand(0);

    Value c0 = rewriter.create<ConstantIndexOp>(loc, 0);

    // The transform length is dimension 0 of the operand, queried through a
    // tensor view of the buffer.
    Value inputFeatures = rewriter.create<bufferization::ToTensorOp>(
        loc, bufferMem, /*restrict=*/true, /*writable=*/true);
    Value inputFeaturesSize =
        rewriter.create<tensor::DimOp>(loc, inputFeatures, c0);

    // Scale factor handed to the forward transform.
    Value f1 = rewriter.create<ConstantFloatOp>(loc, APFloat(double(1.0)),
                                                rewriter.getF64Type());

    // Plan entries, in order: fct, tw, tws, length, nfct, mem.
    std::vector<Value> plan = make_rfftp_plan(rewriter, loc, inputFeaturesSize);

    rfftp_forward(rewriter, loc,
                  /*Rfftp_fctdata_fct=*/plan[0],
                  /*Rfftp_fctdata_tw=*/plan[1],
                  /*Rfftp_fctdata_tws=*/plan[2],
                  /*Rfftp_plan_length=*/plan[3],
                  /*Rfftp_plan_nfct=*/plan[4],
                  /*Rfftp_plan_mem=*/plan[5], bufferMem, f1);

    rewriter.eraseOp(op);
    return success();
  }
};

// Rewrite pattern that expands 'dap.whisper_preprocess'.
//
// The op's memref operand is zero-padded to 480000 samples, reflect-padded by
// 200 on each side, and passed to spectrogram(). The first 80x3000 entries of
// the result are clamped from below at (max - 8), mapped through
// (x + 4) / 4, truncated to f32, and reshaped to a 1x80x3000 memref that
// replaces the op.
class DAPWhisperPreprocessLowering
    : public OpRewritePattern<dap::WhisperPreprocessOp> {
public:
  using OpRewritePattern<dap::WhisperPreprocessOp>::OpRewritePattern;

  explicit DAPWhisperPreprocessLowering(MLIRContext *context)
      : OpRewritePattern(context) {}

  LogicalResult matchAndRewrite(dap::WhisperPreprocessOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value input = op->getOperand(0);

    // Index constants shared by the helpers below.
    Value c0 = rewriter.create<ConstantIndexOp>(loc, 0);
    Value c1 = rewriter.create<ConstantIndexOp>(loc, 1);
    Value c2 = rewriter.create<ConstantIndexOp>(loc, 2);
    Value c3 = rewriter.create<ConstantIndexOp>(loc, 3);
    Value c4 = rewriter.create<ConstantIndexOp>(loc, 4);
    Value c5 = rewriter.create<ConstantIndexOp>(loc, 5);
    Value c80 = rewriter.create<ConstantIndexOp>(loc, 80);
    Value c3000 = rewriter.create<ConstantIndexOp>(loc, 3000);
    Value c480000 = rewriter.create<ConstantIndexOp>(loc, 480000);

    FloatType f32 = rewriter.getF32Type();
    FloatType f64 = rewriter.getF64Type();

    // ---- Pad inputFeatures to MaxLength = 480000 -------------------------
    Value inputFeatures = rewriter.create<bufferization::ToTensorOp>(
        loc, input, /*restrict=*/true, /*writable=*/false);
    Value inputFeaturesSize =
        rewriter.create<tensor::DimOp>(loc, inputFeatures, c0);
    // Number of samples appended at the high end.
    Value padConstantHigh =
        rewriter.create<arith::SubIOp>(loc, c480000, inputFeaturesSize);

    SmallVector<int64_t> paddedShape{480000};
    SmallVector<OpFoldResult> lowValues{c0};
    SmallVector<OpFoldResult> highValues{padConstantHigh};

    Value f0 =
        rewriter.create<arith::ConstantFloatOp>(loc, APFloat(double(0.0)), f64);
    Value paddedInput =
        rewriter
            .create<tensor::PadOp>(loc,
                                   RankedTensorType::get(paddedShape, f64),
                                   inputFeatures, lowValues, highValues, f0)
            .getResult();

    // ---- Log-mel spectrogram ---------------------------------------------
    // Generate melFilter with 391 numbers
    Value melFilter = initMelFilter(rewriter, loc, c0, c1, f0);

    // Generate hanning window with length 400
    Value window = getHanningWindow400(rewriter, loc);

    // Reflect pad for paddedInput, both left and right part pad with length 200
    Value finalPaddedInput =
        padReflect(rewriter, loc, c0, c1, paddedInput, 200, 200);
    Value logSpec = spectrogram(rewriter, loc, f0, c0, c1, c2, c3, c4, c5,
                                finalPaddedInput, window, melFilter);

    // Keep the leading 80x3000 window of the spectrogram.
    Value logSpecCut = rewriter
                           .create<tensor::ExtractSliceOp>(
                               loc, /*source=*/logSpec,
                               /*offsets=*/ValueRange{c0, c0},
                               /*sizes=*/ValueRange{c80, c3000},
                               /*strides=*/ValueRange{c1, c1})
                           .getResult();

    // ---- Maximum over all elements, seeded with -10.0 --------------------
    Value maxInit =
        rewriter.create<ConstantFloatOp>(loc, APFloat(double(-10.0)), f64);
    auto rowLoop = rewriter.create<scf::ForOp>(
        loc, c0, c80, c1, maxInit,
        [&](OpBuilder &rowBuilder, Location loc, Value row,
            ValueRange rowArgs) {
          // The running maximum is threaded through both loops.
          auto colLoop = rowBuilder.create<scf::ForOp>(
              loc, c0, c3000, c1, rowArgs[0],
              [&](OpBuilder &colBuilder, Location loc, Value col,
                  ValueRange colArgs) {
                Value elem = colBuilder.create<tensor::ExtractOp>(
                    loc, logSpecCut, ValueRange{row, col});
                Value larger = colBuilder.create<arith::MaximumFOp>(
                    loc, elem, colArgs[0]);
                colBuilder.create<scf::YieldOp>(loc, larger);
              });

          Value rowMax = colLoop.getResults()[0];
          rowBuilder.create<scf::YieldOp>(loc, rowMax);
        });
    Value maxNum = rowLoop.getResults()[0];

    // ---- Clamp from below at (max - 8) -----------------------------------
    Value f8 = rewriter.create<ConstantFloatOp>(loc, APFloat(double(8.0)), f64);
    Value maxNumMinus8 = rewriter.create<arith::SubFOp>(loc, maxNum, f8);
    Value logSpecFloor = rewriter.create<tensor::SplatOp>(
        loc, RankedTensorType::get({80, 3000}, f64), maxNumMinus8);

    Value logSpecMax = rewriter
                           .create<linalg::MaxOp>(
                               loc,
                               /*input=*/ValueRange{logSpecCut, logSpecFloor},
                               /*outputs=*/ValueRange{logSpecFloor})
                           .getResultTensors()[0];

    // ---- (x + 4) / 4, truncated to f32 -----------------------------------
    Value f0F32 =
        rewriter.create<ConstantFloatOp>(loc, APFloat(float(0.0)), f32);
    Value f4 = rewriter.create<ConstantFloatOp>(loc, APFloat(double(4.0)), f64);
    RankedTensorType resultTy = RankedTensorType::get({80, 3000}, f32);
    Value InputFeaturesF32 =
        rewriter.create<tensor::SplatOp>(loc, resultTy, f0F32);

    // #tail_processing_trait for 'linalg.generic' operation.
    AffineMap IdMap =
        AffineMap::getMultiDimIdentityMap(2, rewriter.getContext());
    SmallVector<AffineMap> IndexingMaps = {IdMap, IdMap};
    SmallVector<utils::IteratorType> IteratorTypes = {
        utils::IteratorType::parallel, utils::IteratorType::parallel};

    // 'linalg.generic' operation use #tail_processing_trait.
    auto tailProcessOp = rewriter.create<linalg::GenericOp>(
        loc, /*resultTensorTypes=*/resultTy,
        /*inputs=*/ValueRange{logSpecMax},
        /*outputs=*/ValueRange{InputFeaturesF32}, IndexingMaps, IteratorTypes,
        [&](OpBuilder &b, Location loc, ValueRange args) {
          Value shifted = b.create<arith::AddFOp>(loc, args[0], f4);
          Value scaled = b.create<arith::DivFOp>(loc, shifted, f4);
          Value narrowed = b.create<arith::TruncFOp>(loc, f32, scaled);
          b.create<linalg::YieldOp>(loc, narrowed);
        });
    Value result = tailProcessOp.getResult(0);

    // ---- Expand 80x3000 to 1x80x3000 -------------------------------------
    // One reassociation group per source dimension: [[0, 1], [2]].
    SmallVector<SmallVector<int64_t, 2>> reassociationIndices(
        resultTy.getRank());
    reassociationIndices[0].assign({0, 1});
    reassociationIndices[1].assign({2});

    RankedTensorType expandTy = RankedTensorType::get({1, 80, 3000}, f32);

    Value resultExpand = rewriter.create<tensor::ExpandShapeOp>(
        loc, /*resultType=*/expandTy, /*src=*/result,
        /*reassociation=*/reassociationIndices);

    MemRefType resultMemTp =
        MemRefType::get(expandTy.getShape(), expandTy.getElementType());
    Value resultMemRef = rewriter.create<bufferization::ToMemrefOp>(
        loc, resultMemTp, resultExpand);

    // Replace 'dap.whisper_preprocess' operation with the generated result. The
    // replaced op is erased.
    rewriter.replaceOp(op, resultMemRef);
    return success();
  }
};

} // end anonymous namespace

// Registers the extended DAP lowering patterns with `patterns`: first the
// 'dap.whisper_preprocess' lowering, then the 'dap.rfft' lowering.
void populateExtendDAPConversionPatterns(RewritePatternSet &patterns) {
  MLIRContext *context = patterns.getContext();
  patterns.add<DAPWhisperPreprocessLowering, DAPRFFTLowering>(context);
  // TODO : extract operators
}

//===----------------------------------------------------------------------===//
// ExtendDAPPass
//===----------------------------------------------------------------------===//

namespace {
// Module-level pass, registered under the argument "extend-dap", that lowers
// the extended DAP operations. The dialects listed in getDependentDialects are
// the ones the lowering creates operations from.
class ExtendDAPPass
    : public PassWrapper<ExtendDAPPass, OperationPass<ModuleOp>> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ExtendDAPPass)
  ExtendDAPPass() = default;
  ExtendDAPPass(const ExtendDAPPass &) {}

  StringRef getArgument() const final { return "extend-dap"; }
  StringRef getDescription() const final { return "Extend DAP Dialect."; }

  void runOnOperation() override;

  void getDependentDialects(DialectRegistry &registry) const override {
    // Upstream dialects first, then the Buddy Compiler designed dialect.
    registry.insert<affine::AffineDialect, arith::ArithDialect,
                    bufferization::BufferizationDialect, func::FuncDialect,
                    linalg::LinalgDialect, math::MathDialect,
                    memref::MemRefDialect, scf::SCFDialect,
                    tensor::TensorDialect, vector::VectorDialect,
                    buddy::dap::DAPDialect>();
  }
};
} // end anonymous namespace.

void ExtendDAPPass::runOnOperation() {
  MLIRContext *context = &getContext();
  ModuleOp module = getOperation();

  ConversionTarget target(*context);
  // Add legal dialects.
  target.addLegalDialect<affine::AffineDialect>();
  target.addLegalDialect<arith::ArithDialect>();
  target.addLegalDialect<bufferization::BufferizationDialect>();
  target.addLegalDialect<func::FuncDialect>();
  target.addLegalDialect<linalg::LinalgDialect>();
  target.addLegalDialect<math::MathDialect>();
  target.addLegalDialect<memref::MemRefDialect>();
  target.addLegalDialect<scf::SCFDialect>();
  target.addLegalDialect<tensor::TensorDialect>();
  target.addLegalDialect<vector::VectorDialect>();
  // Add legal operations.
  target.addLegalOp<ModuleOp, func::FuncOp, func::ReturnOp>();

  RewritePatternSet patterns(context);
  populateExtendDAPConversionPatterns(patterns);

  if (failed(applyPartialConversion(module, target, std::move(patterns))))
    signalPassFailure();
}

namespace mlir::buddy {
// Registers ExtendDAPPass with the global pass registry.
void registerExtendDAPPass() { PassRegistration<ExtendDAPPass>(); }
} // namespace mlir::buddy
